JAVA也可以实现爬虫,比如jsoup包,一个非常方便解析html的工具呢。
创新互联建站服务项目包括曹县网站建设、曹县网站制作、曹县网页制作以及曹县网络营销策划等。多年来,我们专注于互联网行业,利用自身积累的技术优势、行业经验、深度合作伙伴关系等,向广大中小型企业、政府机构等提供互联网行业的解决方案,曹县网站推广取得了明显的社会效益与经济效益。目前,我们服务的客户以成都为中心已经辐射到曹县省份的部分城市,未来相信会继续扩大服务区域并继续获得客户的支持与信任!
不过相对来说,java语言笨重,稍微有些麻烦。
网络爬虫是一个自动提取网页的程序,它为搜索引擎从万维网上下载网页,是搜索引擎的重要组成。\x0d\x0a传统爬虫从一个或若干初始网页的URL开始,获得初始网页上的URL,在抓取网页的过程中,不断从当前页面上抽取新的URL放入队列,直到满足系统的一定停止条件。对于垂直搜索来说,聚焦爬虫,即有针对性地爬取特定主题网页的爬虫,更为适合。\x0d\x0a\x0d\x0a以下是一个使用java实现的简单爬虫核心代码:\x0d\x0apublic void crawl() throws Throwable { \x0d\x0a while (continueCrawling()) { \x0d\x0a CrawlerUrl url = getNextUrl(); //获取待爬取队列中的下一个URL \x0d\x0a if (url != null) { \x0d\x0a printCrawlInfo(); \x0d\x0a String content = getContent(url); //获取URL的文本信息 \x0d\x0a \x0d\x0a //聚焦爬虫只爬取与主题内容相关的网页,这里采用正则匹配简单处理 \x0d\x0a if (isContentRelevant(content, this.regexpSearchPattern)) { \x0d\x0a saveContent(url, content); //保存网页至本地 \x0d\x0a \x0d\x0a //获取网页内容中的链接,并放入待爬取队列中 \x0d\x0a Collection urlStrings = extractUrls(content, url); \x0d\x0a addUrlsToUrlQueue(url, urlStrings); \x0d\x0a } else { \x0d\x0a System.out.println(url + " is not relevant ignoring ..."); \x0d\x0a } \x0d\x0a \x0d\x0a //延时防止被对方屏蔽 \x0d\x0a Thread.sleep(this.delayBetweenUrls); \x0d\x0a } \x0d\x0a } \x0d\x0a closeOutputStream(); \x0d\x0a}\x0d\x0aprivate CrawlerUrl getNextUrl() throws Throwable { \x0d\x0a CrawlerUrl nextUrl = null; \x0d\x0a while ((nextUrl == null) (!urlQueue.isEmpty())) { \x0d\x0a CrawlerUrl crawlerUrl = this.urlQueue.remove(); \x0d\x0a //doWeHavePermissionToVisit:是否有权限访问该URL,友好的爬虫会根据网站提供的"Robot.txt"中配置的规则进行爬取 \x0d\x0a //isUrlAlreadyVisited:URL是否访问过,大型的搜索引擎往往采用BloomFilter进行排重,这里简单使用HashMap \x0d\x0a //isDepthAcceptable:是否达到指定的深度上限。爬虫一般采取广度优先的方式。一些网站会构建爬虫陷阱(自动生成一些无效链接使爬虫陷入死循环),采用深度限制加以避免 \x0d\x0a if (doWeHavePermissionToVisit(crawlerUrl) \x0d\x0a (!isUrlAlreadyVisited(crawlerUrl)) \x0d\x0a isDepthAcceptable(crawlerUrl)) { \x0d\x0a nextUrl = crawlerUrl; \x0d\x0a // System.out.println("Next url to be visited is " + nextUrl); \x0d\x0a } \x0d\x0a } \x0d\x0a return nextUrl; \x0d\x0a}\x0d\x0aprivate String getContent(CrawlerUrl url) throws Throwable { \x0d\x0a //HttpClient4.1的调用与之前的方式不同 \x0d\x0a HttpClient client = new DefaultHttpClient(); \x0d\x0a HttpGet httpGet = new HttpGet(url.getUrlString()); \x0d\x0a StringBuffer strBuf = new StringBuffer(); \x0d\x0a HttpResponse response = client.execute(httpGet); \x0d\x0a if (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) { \x0d\x0a HttpEntity entity = response.getEntity(); \x0d\x0a if (entity != null) { \x0d\x0a BufferedReader reader = new BufferedReader( \x0d\x0a new InputStreamReader(entity.getContent(), "UTF-8")); \x0d\x0a String line = null; \x0d\x0a if (entity.getContentLength() 0) { \x0d\x0a strBuf = new StringBuffer((int) entity.getContentLength()); \x0d\x0a while ((line = reader.readLine()) != null) { \x0d\x0a strBuf.append(line); \x0d\x0a } \x0d\x0a } \x0d\x0a } \x0d\x0a if (entity != null) { \x0d\x0a nsumeContent(); \x0d\x0a } \x0d\x0a } \x0d\x0a //将url标记为已访问 \x0d\x0a markUrlAsVisited(url); \x0d\x0a return strBuf.toString(); \x0d\x0a}\x0d\x0apublic static boolean isContentRelevant(String content, \x0d\x0aPattern regexpPattern) { \x0d\x0a boolean retValue = false; \x0d\x0a if (content != null) { \x0d\x0a //是否符合正则表达式的条件 \x0d\x0a Matcher m = regexpPattern.matcher(content.toLowerCase()); \x0d\x0a retValue = m.find(); \x0d\x0a } \x0d\x0a return retValue; \x0d\x0a}\x0d\x0apublic List extractUrls(String text, CrawlerUrl crawlerUrl) { \x0d\x0a Map urlMap = new HashMap(); \x0d\x0a extractHttpUrls(urlMap, text); \x0d\x0a extractRelativeUrls(urlMap, text, crawlerUrl); \x0d\x0a return new ArrayList(urlMap.keySet()); \x0d\x0a} \x0d\x0aprivate void extractHttpUrls(Map urlMap, String text) { \x0d\x0a Matcher m = (text); \x0d\x0a while (m.find()) { \x0d\x0a String url = m.group(); \x0d\x0a String[] terms = url.split("a href=\""); \x0d\x0a for (String term : terms) { \x0d\x0a // System.out.println("Term = " + term); \x0d\x0a if (term.startsWith("http")) { \x0d\x0a int index = term.indexOf("\""); \x0d\x0a if (index 0) { \x0d\x0a term = term.substring(0, index); \x0d\x0a } \x0d\x0a urlMap.put(term, term); \x0d\x0a System.out.println("Hyperlink: " + term); \x0d\x0a } \x0d\x0a } \x0d\x0a } \x0d\x0a} \x0d\x0aprivate void extractRelativeUrls(Map urlMap, String text, \x0d\x0a CrawlerUrl crawlerUrl) { \x0d\x0a Matcher m = relativeRegexp.matcher(text); \x0d\x0a URL textURL = crawlerUrl.getURL(); \x0d\x0a String host = textURL.getHost(); \x0d\x0a while (m.find()) { \x0d\x0a String url = m.group(); \x0d\x0a String[] terms = url.split("a href=\""); \x0d\x0a for (String term : terms) { \x0d\x0a if (term.startsWith("/")) { \x0d\x0a int index = term.indexOf("\""); \x0d\x0a if (index 0) { \x0d\x0a term = term.substring(0, index); \x0d\x0a } \x0d\x0a String s = //" + host + term; \x0d\x0a urlMap.put(s, s); \x0d\x0a System.out.println("Relative url: " + s); \x0d\x0a } \x0d\x0a } \x0d\x0a } \x0d\x0a \x0d\x0a}\x0d\x0apublic static void main(String[] args) { \x0d\x0a try { \x0d\x0a String url = ""; \x0d\x0a Queue urlQueue = new LinkedList(); \x0d\x0a String regexp = "java"; \x0d\x0a urlQueue.add(new CrawlerUrl(url, 0)); \x0d\x0a NaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L, \x0d\x0a regexp); \x0d\x0a // boolean allowCrawl = crawler.areWeAllowedToVisit(url); \x0d\x0a // System.out.println("Allowed to crawl: " + url + " " + \x0d\x0a // allowCrawl); \x0d\x0a crawler.crawl(); \x0d\x0a } catch (Throwable t) { \x0d\x0a System.out.println(t.toString()); \x0d\x0a t.printStackTrace(); \x0d\x0a } \x0d\x0a}
以下是一个使用java实现的简单爬虫核心代码:
public void crawl() throws Throwable {
while (continueCrawling()) {
CrawlerUrl url = getNextUrl(); //获取待爬取队列中的下一个URL
if (url != null) {
printCrawlInfo();
String content = getContent(url); //获取URL的文本信息
//聚焦爬虫只爬取与主题内容相关的网页,这里采用正则匹配简单处理
if (isContentRelevant(content, this.regexpSearchPattern)) {
saveContent(url, content); //保存网页至本地
//获取网页内容中的链接,并放入待爬取队列中
Collection urlStrings = extractUrls(content, url);
addUrlsToUrlQueue(url, urlStrings);
} else {
System.out.println(url + " is not relevant ignoring ...");
}
//延时防止被对方屏蔽
Thread.sleep(this.delayBetweenUrls);
}
}
closeOutputStream();
}
private CrawlerUrl getNextUrl() throws Throwable {
CrawlerUrl nextUrl = null;
while ((nextUrl == null) (!urlQueue.isEmpty())) {
CrawlerUrl crawlerUrl = this.urlQueue.remove();
//doWeHavePermissionToVisit:是否有权限访问该URL,友好的爬虫会根据网站提供的"Robot.txt"中配置的规则进行爬取
//isUrlAlreadyVisited:URL是否访问过,大型的搜索引擎往往采用BloomFilter进行排重,这里简单使用HashMap
//isDepthAcceptable:是否达到指定的深度上限。爬虫一般采取广度优先的方式。一些网站会构建爬虫陷阱(自动生成一些无效链接使爬虫陷入死循环),采用深度限制加以避免
if (doWeHavePermissionToVisit(crawlerUrl)
(!isUrlAlreadyVisited(crawlerUrl))
isDepthAcceptable(crawlerUrl)) {
nextUrl = crawlerUrl;
// System.out.println("Next url to be visited is " + nextUrl);
}
}
return nextUrl;
}
private String getContent(CrawlerUrl url) throws Throwable {
//HttpClient4.1的调用与之前的方式不同
HttpClient client = new DefaultHttpClient();
HttpGet httpGet = new HttpGet(url.getUrlString());
StringBuffer strBuf = new StringBuffer();
HttpResponse response = client.execute(httpGet);
if (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) {
HttpEntity entity = response.getEntity();
if (entity != null) {
BufferedReader reader = new BufferedReader(
new InputStreamReader(entity.getContent(), "UTF-8"));
String line = null;
if (entity.getContentLength() 0) {
strBuf = new StringBuffer((int) entity.getContentLength());
while ((line = reader.readLine()) != null) {
strBuf.append(line);
}
}
}
if (entity != null) {
nsumeContent();
}
}
//将url标记为已访问
markUrlAsVisited(url);
return strBuf.toString();
}
public static boolean isContentRelevant(String content,
Pattern regexpPattern) {
boolean retValue = false;
if (content != null) {
//是否符合正则表达式的条件
Matcher m = regexpPattern.matcher(content.toLowerCase());
retValue = m.find();
}
return retValue;
}
public List extractUrls(String text, CrawlerUrl crawlerUrl) {
Map urlMap = new HashMap();
extractHttpUrls(urlMap, text);
extractRelativeUrls(urlMap, text, crawlerUrl);
return new ArrayList(urlMap.keySet());
}
private void extractHttpUrls(Map urlMap, String text) {
Matcher m = (text);
while (m.find()) {
String url = m.group();
String[] terms = url.split("a href=\"");
for (String term : terms) {
// System.out.println("Term = " + term);
if (term.startsWith("http")) {
int index = term.indexOf("\"");
if (index 0) {
term = term.substring(0, index);
}
urlMap.put(term, term);
System.out.println("Hyperlink: " + term);
}
}
}
}
private void extractRelativeUrls(Map urlMap, String text,
CrawlerUrl crawlerUrl) {
Matcher m = relativeRegexp.matcher(text);
URL textURL = crawlerUrl.getURL();
String host = textURL.getHost();
while (m.find()) {
String url = m.group();
String[] terms = url.split("a href=\"");
for (String term : terms) {
if (term.startsWith("/")) {
int index = term.indexOf("\"");
if (index 0) {
term = term.substring(0, index);
}
String s = //" + host + term;
urlMap.put(s, s);
System.out.println("Relative url: " + s);
}
}
}
}
public static void main(String[] args) {
try {
String url = "";
Queue urlQueue = new LinkedList();
String regexp = "java";
urlQueue.add(new CrawlerUrl(url, 0));
NaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L,
regexp);
// boolean allowCrawl = crawler.areWeAllowedToVisit(url);
// System.out.println("Allowed to crawl: " + url + " " +
// allowCrawl);
crawler.crawl();
} catch (Throwable t) {
System.out.println(t.toString());
t.printStackTrace();
}
}
codeblocks能不能写我不知道,但不仅仅只有java可以写爬虫,还有像python等的语言也可以写