和之前爬取天气网站一样,现在用webmagic爬取中关村在线华为手机的评论。(http://detail.zol.com.cn/405/404275/review.shtml)
之前的天气网站的数据是静态的,解析时很容易就能获取;这次的评论数据不一样,是由js动态加载的。
f12打开开发人员工具,点击第二页时我们可以在network中看到一条xhr请求
可以发现请求的规律:一款手机对应一个proId,page为页码(URL中的%5E是分隔符^的URL编码),因此我们可以构造请求,模拟浏览器发送来获得每一页数据。
完整代码如下:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;public class HuaweiRepoPageProcessor implements PageProcessor {// 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);/** ascii码转汉字*/private static String ascii2native(String asciicode){String[] asciis = asciicode.split ("\\\\u");String nativeValue = asciis[0];try{for ( int i = 1; i < asciis.length; i++ ){String code = asciis[i];nativeValue += (char) Integer.parseInt (code.substring (0, 4), 16);if (code.length () > 4){nativeValue += code.substring (4, code.length ());}}}catch (NumberFormatException e){return asciicode;}return nativeValue;}/** 获取Ajax请求*/public static String getAjax(int arg,int pagei){return "http://detail.zol.com.cn/xhr3_Review_GetListAndPage_isFilter=0%5EproId="+arg+"%5Epage="+pagei+".html";}public static String getType(int arg){return "/"+(arg/1000+1);}public static int getMobileArg(String url){Pattern p = Pattern.compile("[\\d]+"); Matcher m = p.matcher(url); m.find();m.group();m.find();return Integer.parseInt(m.group());}/** 获取下一页详情页链接*/public static List<String> getNext(String nextUrl,String type){List<String> res = new ArrayList<String>();URL url;try {url = new URL(nextUrl);HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.connect(); InputStream urlStream = connection.getInputStream();BufferedReader reader = new BufferedReader(new InputStreamReader(urlStream)); //str就是页面代码,用split函数和正则表达式分割strString str=reader.readLine(); while(true){int l = str.indexOf(type);if(l==-1)break;int r = str.indexOf("tagNav");String nexturl = str.substring(l, r+6).replace("\\", "");//System.out.println(nexturl);res.add("http://detail.zol.com.cn"+nexturl);str = str.substring(r+6);}} catch (MalformedURLException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();} return res;}/** 获取回复的Ajax请求*/public static String getReply(String url,int pagei){Pattern p = 
Pattern.compile("[\\d]+"); Matcher m = p.matcher(url); List<String> num = new ArrayList<String>();int numi = 0;while(m.find()){ //System.out.println(m.group());num.add(m.group()); } String nu [] = {"",""};int i=0;for(String s : num){if(s.length()>5){nu[i++] = s;} }String res = "http://detail.zol.com.cn/xhr3_Review_GetReplyPart_reviewId="+nu[1]+"%5EsubcateId=57%5EproId="+nu[0]+"%5EisReviewDetail=1%5EsubPageType=Review%5Epage="+pagei+".html";//System.out.println(res);//System.out.println("****************");return res;}/** 根据Ajax请求得到Document*/public static Document getReplyDoc(String docUrl){URL url;Document docList = null;try {url = new URL(docUrl);HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.connect(); InputStream urlStream = connection.getInputStream();BufferedReader reader = new BufferedReader(new InputStreamReader(urlStream)); //str就是页面代码String str=reader.readLine(); str = ascii2native(str);//System.out.println(str);String s = str.replace("\\", "");//System.out.println(s);docList = Jsoup.parse(s);} catch (MalformedURLException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();} return docList;}public static void print(HuaweiMobile review){System.out.println("型号:"+(review.getXinghao()==null?"":review.getXinghao()));System.out.println("时间:"+(review.getTime()==null?"":review.getTime()));System.out.println("地点:"+(review.getPlace()==null?"":review.getPlace()));System.out.println("标题:"+review.getTitle());System.out.println("优点:"+review.getYoudian());System.out.println("缺点:"+review.getQuedian());System.out.println("总结:"+review.getZongjie());System.out.println("内容:"+review.getContent());for(Entry<String,String> map : review.getReply().entrySet()){System.out.println(map.getKey()+"\t"+map.getValue());}System.out.println("****************************");}/** 解析页面*/public void process(Page page) {// 
部分二:定义如何抽取页面信息,并保存下来//System.out.println(my++);String url = page.getUrl().toString();//System.out.println(page.);//System.out.println(url);if(url.endsWith("tagNav")){HuaweiMobile review = new HuaweiMobile();Html html = page.getHtml();String xinghao = Jsoup.parse(html.xpath("/html/body/div[3]/div[3]/a[4]").toString()).getElementsByTag("a").get(0).text();review.setXinghao(xinghao);String time = Jsoup.parse(html.xpath("//*[@id=\"J_CommentContent\"]/div[2]/h3/span").toString()).getElementsByTag("span").get(0).text();review.setTime(time);//System.out.println(xinghao);//System.out.println(time);String content = html.toString();Document docList = Jsoup.parse(content);Elements pro = docList.getElementsByClass("product-parameter");if(pro.size()>0){Elements LiTag = pro.get(0).getElementsByTag("li");for(int i = 0;i<LiTag.size();i++){String text = LiTag.get(i).text();//System.out.println(LiTag.get(i).text());String span = LiTag.get(i).getElementsByTag("span").get(0).text();if(span.matches(".*型号.*")){review.setXinghao(LiTag.get(i).text().replaceAll("产品型号:", ""));}else if(span.matches(".*时间.*")){review.setPlace(LiTag.get(i).text().replaceAll("[\\d-()时间地点:]", ""));review.setTime(LiTag.get(i).text().replaceAll("[^\\d-]", ""));}}}String comcontent = null;Elements comtit = docList.getElementsByClass("comments-content");if(comtit.size()>0){if(comtit.get(0).getElementsByTag("h3").size()>0){String tit = comtit.get(0).getElementsByTag("h3").get(0).text();comcontent = tit.replaceAll("[\\d-]", "");review.setTitle(comcontent);} }Elements com = docList.getElementsByClass("comments-words");for(int i = 0;i<com.size();i++){Elements strongs = com.get(i).getElementsByTag("strong");String strong = strongs.get(0).html();//System.out.println(strongs.get(0).html());Elements ps = com.get(i).getElementsByTag("p");Elements spans = ps.get(0).getElementsByTag("span");//System.out.println(spans.get(0).html());if(strong.matches(".*优点.*")){review.setYoudian(spans.get(0).html());comcontent += 
"\n优点:\n"+spans.get(0).html();}else if(strong.matches(".*缺点.*")){review.setQuedian(spans.get(0).html());comcontent += "\n缺点:\n"+spans.get(0).html();}else if(strong.matches(".*总结.*")){review.setZongjie(spans.get(0).html());comcontent += "\n总结:\n"+spans.get(0).html();}}review.setContent(comcontent);//System.out.println(getReply(url, 1));int pagei = 1;while(true){Document doc = getReplyDoc(getReply(url, pagei));Elements ereply = doc.getElementsByClass("reply-item");//System.out.println(ereply.size());if(ereply.size()==0)break;for(int i = 0 ;i<ereply.size();i++){review.addReply(ereply.get(i).getElementsByTag("em").get(0).text(), ereply.get(i).getElementsByTag("p").get(0).text());}pagei++;}print(review);}else if(url.matches("http://detail.zol.com.cn/[\\d]+/[\\d]+/review.shtml")){int mobileArg = getMobileArg(url);int pagei = 1;while(true){List<String> s = getNext(getAjax(mobileArg, pagei),getType(mobileArg));if(s.size()==0)break;page.addTargetRequests(s); pagei++;}}// 部分三:从页面发现后续的url地址来抓取page.addTargetRequests(page.getHtml().links().regex("http://detail.zol.com.cn/cell_phone/index[\\d]+.shtml").all());page.addTargetRequests(page.getHtml().links().regex("http://detail.zol.com.cn/[\\d]+/[\\d]+/review.shtml").all());//page.addTargetRequests(page.getHtml().links().regex("http://detail\\.zol\\.com\\.cn/index\\.php?c=AjaxVer3_Review&a=GetListAndPage&isFilter=0&proId=386269&page=[\\d]+").all());}/** 获取链接,启动爬虫*/public static void huaweiSpider(int mobileArg){Spider spider = Spider.create(new HuaweiRepoPageProcessor());int pagei = 1;while(true){List<String> s = getNext(getAjax(mobileArg, pagei),getType(mobileArg));if(s.size()==0)break;for(String a:s){//System.out.println(a);spider.addUrl(a); }pagei++;}//Spider.create(new HuaweiRepoPageProcessor()).addUrls(ss).thread(5).run();spider.thread(5).run();}public Site getSite() {return site;}public static void main(String[] args) {//huaweiSpider(395493);Spider.create(new 
HuaweiRepoPageProcessor()).addUrl("http://detail.zol.com.cn/cell_phone_index/subcate57_613_list_1.html").thread(5).run();}
}