我主要使用Jsoup解析,獲取源碼有時(shí)候使用Jsoup,比較復(fù)雜的時(shí)候比如需要換ip,改編碼或者模擬登陸的時(shí)候使用HttpClient,以下是抓取開源中國新聞的一段代碼,可以運(yùn)行。
創(chuàng)新互聯(lián)公司是專業(yè)的青田網(wǎng)站建設(shè)公司,青田接單;提供成都做網(wǎng)站、成都網(wǎng)站制作,網(wǎng)頁設(shè)計(jì),網(wǎng)站設(shè)計(jì),建網(wǎng)站,PHP網(wǎng)站建設(shè)等專業(yè)做網(wǎng)站服務(wù);采用PHP框架,可快速的進(jìn)行青田網(wǎng)站開發(fā)網(wǎng)頁制作和功能擴(kuò)展;專業(yè)做搜索引擎喜愛的網(wǎng)站,專業(yè)的做網(wǎng)站團(tuán)隊(duì),希望更多企業(yè)前來合作!
package demo;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
*
* 使用JSoup 解析網(wǎng)頁,語法使用 JS,css,Jquery 選擇器語法,方便易懂
*
* Jsoup教程網(wǎng):jsoup開發(fā)指南,jsoup中文使用手冊,jsoup中文文檔
*
* @author geekfly
*
*/
public class JsoupDemo {
public static void main(String[] args) throws IOException {
String url = "新聞資訊 - 開源中國社區(qū)";
Document document = Jsoup.connect(url).userAgent("Mozilla/5.0 (Windows NT 6.1; rv:30.0) Gecko/20100101 Firefox/30.0").get();
Elements elements = document.select("#RecentNewsList .List li");
for (Element element : elements) {
Elements titleElement = element.select("h2 a");
String title = titleElement.text();
String link = titleElement.attr("href").trim();
Elements dataElement = element.select(".date");
Elements autherElement = dataElement.select("a");
String auther = autherElement.text();
autherElement.remove();
String date = dataElement.text();
String detail = element.select(".detail").text();
System.out.println("鏈接: " + link);
System.out.println("標(biāo)題: " + title);
System.out.println("作者: " + auther);
System.out.println("發(fā)布時(shí)間: " + date);
System.out.println("詳細(xì)信息: " + detail);
System.out.println();
System.out.println();
}
System.out.println(elements.size());
}
}
以下是一個(gè)使用java實(shí)現(xiàn)的簡單爬蟲核心代碼:
public void crawl() throws Throwable {
while (continueCrawling()) {
CrawlerUrl url = getNextUrl(); //獲取待爬取隊(duì)列中的下一個(gè)URL
if (url != null) {
printCrawlInfo();
String content = getContent(url); //獲取URL的文本信息
//聚焦爬蟲只爬取與主題內(nèi)容相關(guān)的網(wǎng)頁,這里采用正則匹配簡單處理
if (isContentRelevant(content, this.regexpSearchPattern)) {
saveContent(url, content); //保存網(wǎng)頁至本地
//獲取網(wǎng)頁內(nèi)容中的鏈接,并放入待爬取隊(duì)列中
Collection urlStrings = extractUrls(content, url);
addUrlsToUrlQueue(url, urlStrings);
} else {
System.out.println(url + " is not relevant ignoring ...");
}
//延時(shí)防止被對方屏蔽
Thread.sleep(this.delayBetweenUrls);
}
}
closeOutputStream();
}
private CrawlerUrl getNextUrl() throws Throwable {
CrawlerUrl nextUrl = null;
while ((nextUrl == null) (!urlQueue.isEmpty())) {
CrawlerUrl crawlerUrl = this.urlQueue.remove();
//doWeHavePermissionToVisit:是否有權(quán)限訪問該URL,友好的爬蟲會(huì)根據(jù)網(wǎng)站提供的"Robot.txt"中配置的規(guī)則進(jìn)行爬取
//isUrlAlreadyVisited:URL是否訪問過,大型的搜索引擎往往采用BloomFilter進(jìn)行排重,這里簡單使用HashMap
//isDepthAcceptable:是否達(dá)到指定的深度上限。爬蟲一般采取廣度優(yōu)先的方式。一些網(wǎng)站會(huì)構(gòu)建爬蟲陷阱(自動(dòng)生成一些無效鏈接使爬蟲陷入死循環(huán)),采用深度限制加以避免
if (doWeHavePermissionToVisit(crawlerUrl)
(!isUrlAlreadyVisited(crawlerUrl))
isDepthAcceptable(crawlerUrl)) {
nextUrl = crawlerUrl;
// System.out.println("Next url to be visited is " + nextUrl);
}
}
return nextUrl;
}
private String getContent(CrawlerUrl url) throws Throwable {
//HttpClient4.1的調(diào)用與之前的方式不同
HttpClient client = new DefaultHttpClient();
HttpGet httpGet = new HttpGet(url.getUrlString());
StringBuffer strBuf = new StringBuffer();
HttpResponse response = client.execute(httpGet);
if (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) {
HttpEntity entity = response.getEntity();
if (entity != null) {
BufferedReader reader = new BufferedReader(
new InputStreamReader(entity.getContent(), "UTF-8"));
String line = null;
if (entity.getContentLength() 0) {
strBuf = new StringBuffer((int) entity.getContentLength());
while ((line = reader.readLine()) != null) {
strBuf.append(line);
}
}
}
if (entity != null) {
nsumeContent();
}
}
//將url標(biāo)記為已訪問
markUrlAsVisited(url);
return strBuf.toString();
}
public static boolean isContentRelevant(String content,
Pattern regexpPattern) {
boolean retValue = false;
if (content != null) {
//是否符合正則表達(dá)式的條件
Matcher m = regexpPattern.matcher(content.toLowerCase());
retValue = m.find();
}
return retValue;
}
public List extractUrls(String text, CrawlerUrl crawlerUrl) {
Map urlMap = new HashMap();
extractHttpUrls(urlMap, text);
extractRelativeUrls(urlMap, text, crawlerUrl);
return new ArrayList(urlMap.keySet());
}
private void extractHttpUrls(Map urlMap, String text) {
Matcher m = (text);
while (m.find()) {
String url = m.group();
String[] terms = url.split("a href=\"");
for (String term : terms) {
// System.out.println("Term = " + term);
if (term.startsWith("http")) {
int index = term.indexOf("\"");
if (index 0) {
term = term.substring(0, index);
}
urlMap.put(term, term);
System.out.println("Hyperlink: " + term);
}
}
}
}
private void extractRelativeUrls(Map urlMap, String text,
CrawlerUrl crawlerUrl) {
Matcher m = relativeRegexp.matcher(text);
URL textURL = crawlerUrl.getURL();
String host = textURL.getHost();
while (m.find()) {
String url = m.group();
String[] terms = url.split("a href=\"");
for (String term : terms) {
if (term.startsWith("/")) {
int index = term.indexOf("\"");
if (index 0) {
term = term.substring(0, index);
}
String s = //" + host + term;
urlMap.put(s, s);
System.out.println("Relative url: " + s);
}
}
}
}
public static void main(String[] args) {
try {
String url = "";
Queue urlQueue = new LinkedList();
String regexp = "java";
urlQueue.add(new CrawlerUrl(url, 0));
NaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L,
regexp);
// boolean allowCrawl = crawler.areWeAllowedToVisit(url);
// System.out.println("Allowed to crawl: " + url + " " +
// allowCrawl);
crawler.crawl();
} catch (Throwable t) {
System.out.println(t.toString());
t.printStackTrace();
}
}
網(wǎng)絡(luò)爬蟲是一個(gè)自動(dòng)提取網(wǎng)頁的程序,它為搜索引擎從萬維網(wǎng)上下載網(wǎng)頁,是搜索引擎的重要組成。\x0d\x0a傳統(tǒng)爬蟲從一個(gè)或若干初始網(wǎng)頁的URL開始,獲得初始網(wǎng)頁上的URL,在抓取網(wǎng)頁的過程中,不斷從當(dāng)前頁面上抽取新的URL放入隊(duì)列,直到滿足系統(tǒng)的一定停止條件。對于垂直搜索來說,聚焦爬蟲,即有針對性地爬取特定主題網(wǎng)頁的爬蟲,更為適合。\x0d\x0a\x0d\x0a以下是一個(gè)使用java實(shí)現(xiàn)的簡單爬蟲核心代碼:\x0d\x0apublic void crawl() throws Throwable { \x0d\x0a while (continueCrawling()) { \x0d\x0a CrawlerUrl url = getNextUrl(); //獲取待爬取隊(duì)列中的下一個(gè)URL \x0d\x0a if (url != null) { \x0d\x0a printCrawlInfo(); \x0d\x0a String content = getContent(url); //獲取URL的文本信息 \x0d\x0a \x0d\x0a //聚焦爬蟲只爬取與主題內(nèi)容相關(guān)的網(wǎng)頁,這里采用正則匹配簡單處理 \x0d\x0a if (isContentRelevant(content, this.regexpSearchPattern)) { \x0d\x0a saveContent(url, content); //保存網(wǎng)頁至本地 \x0d\x0a \x0d\x0a //獲取網(wǎng)頁內(nèi)容中的鏈接,并放入待爬取隊(duì)列中 \x0d\x0a Collection urlStrings = extractUrls(content, url); \x0d\x0a addUrlsToUrlQueue(url, urlStrings); \x0d\x0a } else { \x0d\x0a System.out.println(url + " is not relevant ignoring ..."); \x0d\x0a } \x0d\x0a \x0d\x0a //延時(shí)防止被對方屏蔽 \x0d\x0a Thread.sleep(this.delayBetweenUrls); \x0d\x0a } \x0d\x0a } \x0d\x0a closeOutputStream(); \x0d\x0a}\x0d\x0aprivate CrawlerUrl getNextUrl() throws Throwable { \x0d\x0a CrawlerUrl nextUrl = null; \x0d\x0a while ((nextUrl == null) (!urlQueue.isEmpty())) { \x0d\x0a CrawlerUrl crawlerUrl = this.urlQueue.remove(); \x0d\x0a //doWeHavePermissionToVisit:是否有權(quán)限訪問該URL,友好的爬蟲會(huì)根據(jù)網(wǎng)站提供的"Robot.txt"中配置的規(guī)則進(jìn)行爬取 \x0d\x0a //isUrlAlreadyVisited:URL是否訪問過,大型的搜索引擎往往采用BloomFilter進(jìn)行排重,這里簡單使用HashMap \x0d\x0a //isDepthAcceptable:是否達(dá)到指定的深度上限。爬蟲一般采取廣度優(yōu)先的方式。一些網(wǎng)站會(huì)構(gòu)建爬蟲陷阱(自動(dòng)生成一些無效鏈接使爬蟲陷入死循環(huán)),采用深度限制加以避免 \x0d\x0a if (doWeHavePermissionToVisit(crawlerUrl) \x0d\x0a (!isUrlAlreadyVisited(crawlerUrl)) \x0d\x0a isDepthAcceptable(crawlerUrl)) { \x0d\x0a nextUrl = crawlerUrl; \x0d\x0a // System.out.println("Next url to be visited is " + nextUrl); \x0d\x0a } \x0d\x0a } \x0d\x0a return nextUrl; \x0d\x0a}\x0d\x0aprivate String getContent(CrawlerUrl url) throws Throwable { \x0d\x0a //HttpClient4.1的調(diào)用與之前的方式不同 \x0d\x0a HttpClient client = new DefaultHttpClient(); \x0d\x0a HttpGet httpGet = new HttpGet(url.getUrlString()); \x0d\x0a StringBuffer strBuf = new StringBuffer(); \x0d\x0a HttpResponse response = client.execute(httpGet); \x0d\x0a if (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) { \x0d\x0a HttpEntity entity = response.getEntity(); \x0d\x0a if (entity != null) { \x0d\x0a BufferedReader reader = new BufferedReader( \x0d\x0a new InputStreamReader(entity.getContent(), "UTF-8")); \x0d\x0a String line = null; \x0d\x0a if (entity.getContentLength() 0) { \x0d\x0a strBuf = new StringBuffer((int) entity.getContentLength()); \x0d\x0a while ((line = reader.readLine()) != null) { \x0d\x0a strBuf.append(line); \x0d\x0a } \x0d\x0a } \x0d\x0a } \x0d\x0a if (entity != null) { \x0d\x0a nsumeContent(); \x0d\x0a } \x0d\x0a } \x0d\x0a //將url標(biāo)記為已訪問 \x0d\x0a markUrlAsVisited(url); \x0d\x0a return strBuf.toString(); \x0d\x0a}\x0d\x0apublic static boolean isContentRelevant(String content, \x0d\x0aPattern regexpPattern) { \x0d\x0a boolean retValue = false; \x0d\x0a if (content != null) { \x0d\x0a //是否符合正則表達(dá)式的條件 \x0d\x0a Matcher m = regexpPattern.matcher(content.toLowerCase()); \x0d\x0a retValue = m.find(); \x0d\x0a } \x0d\x0a return retValue; \x0d\x0a}\x0d\x0apublic List extractUrls(String text, CrawlerUrl crawlerUrl) { \x0d\x0a Map urlMap = new HashMap(); \x0d\x0a extractHttpUrls(urlMap, text); \x0d\x0a extractRelativeUrls(urlMap, text, crawlerUrl); \x0d\x0a return new ArrayList(urlMap.keySet()); \x0d\x0a} \x0d\x0aprivate void extractHttpUrls(Map urlMap, String text) { \x0d\x0a Matcher m = (text); \x0d\x0a while (m.find()) { \x0d\x0a String url = m.group(); \x0d\x0a String[] terms = url.split("a href=\""); \x0d\x0a for (String term : terms) { \x0d\x0a // System.out.println("Term = " + term); \x0d\x0a if (term.startsWith("http")) { \x0d\x0a int index = term.indexOf("\""); \x0d\x0a if (index 0) { \x0d\x0a term = term.substring(0, index); \x0d\x0a } \x0d\x0a urlMap.put(term, term); \x0d\x0a System.out.println("Hyperlink: " + term); \x0d\x0a } \x0d\x0a } \x0d\x0a } \x0d\x0a} \x0d\x0aprivate void extractRelativeUrls(Map urlMap, String text, \x0d\x0a CrawlerUrl crawlerUrl) { \x0d\x0a Matcher m = relativeRegexp.matcher(text); \x0d\x0a URL textURL = crawlerUrl.getURL(); \x0d\x0a String host = textURL.getHost(); \x0d\x0a while (m.find()) { \x0d\x0a String url = m.group(); \x0d\x0a String[] terms = url.split("a href=\""); \x0d\x0a for (String term : terms) { \x0d\x0a if (term.startsWith("/")) { \x0d\x0a int index = term.indexOf("\""); \x0d\x0a if (index 0) { \x0d\x0a term = term.substring(0, index); \x0d\x0a } \x0d\x0a String s = //" + host + term; \x0d\x0a urlMap.put(s, s); \x0d\x0a System.out.println("Relative url: " + s); \x0d\x0a } \x0d\x0a } \x0d\x0a } \x0d\x0a \x0d\x0a}\x0d\x0apublic static void main(String[] args) { \x0d\x0a try { \x0d\x0a String url = ""; \x0d\x0a Queue urlQueue = new LinkedList(); \x0d\x0a String regexp = "java"; \x0d\x0a urlQueue.add(new CrawlerUrl(url, 0)); \x0d\x0a NaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L, \x0d\x0a regexp); \x0d\x0a // boolean allowCrawl = crawler.areWeAllowedToVisit(url); \x0d\x0a // System.out.println("Allowed to crawl: " + url + " " + \x0d\x0a // allowCrawl); \x0d\x0a crawler.crawl(); \x0d\x0a } catch (Throwable t) { \x0d\x0a System.out.println(t.toString()); \x0d\x0a t.printStackTrace(); \x0d\x0a } \x0d\x0a}