使用URLConnection爬取评论

news/2025/1/13 6:29:05/

本文将演示如何使用 SpiderHttpUtils 来爬取某知名*猫平台的评论信息。

以 https://detail.tmall.com/item.htm?id=18539499729 宝贝为例，使用 Fiddler 抓包工具可以获取到它的评论请求地址（即下文代码中的 https://rate.tmall.com/list_detail_rate.htm 请求），其中的 currentPage 参数即为被爬取的评论的页码。

在 pom.xml 文件中引入依赖包:

	<parent>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-parent</artifactId>
		<version>1.5.7.RELEASE</version>
		<relativePath />
	</parent>
	<dependencies>
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-text</artifactId>
			<version>1.6</version>
		</dependency>
		<dependency>
			<groupId>org.apdplat</groupId>
			<artifactId>word</artifactId>
			<version>1.3</version>
		</dependency>
		<dependency>
			<groupId>org.json</groupId>
			<artifactId>json</artifactId>
		</dependency>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<scope>test</scope>
		</dependency>
	</dependencies>

 爬取评论的完整代码如下:

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.text.StringEscapeUtils;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.Word;
import org.json.JSONArray;
import org.json.JSONObject;
import org.junit.Test;import spider.SpiderHttpUtils;public class SpiderTest {public Map<String, String> getHeaders() {Map<String, String> headers = new HashMap<String, String>();headers.put("Host", "rate.tmall.com");headers.put("Referer"," https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.50be3bd8ewlaTd&id=41504319950&user_id=1975415428&cat_id=2");// headers.put("Accept-Language",// "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7");headers.put("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");headers.put("Cookie","cna=LuKHFKl4TlECAXQZ4Ux2g/Nd; cookie2=1d8425e75fcbd3cdaa40611db6680374; t=17fe97a643f4e1510f9e2977f9cbdd7d; _tb_token_=5734e153a5d34; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; dnk=pengjun%5Cu674E; lid=pengjun%E6%9D%8E; hng=CN%7Czh-CN%7CCNY%7C156; sn=%E5%85%A8%E6%A3%89%E6%97%B6%E4%BB%A3%E5%AE%98%E6%96%B9%E6%97%97%E8%88%B0%E5%BA%97%3Azfx; tk_trace=1; tracknick=pengjun%5Cu674E;lgc=pengjun%5Cu674E; enc=0F%2FkiNyKc%2F1vIUcjp6C7VI6tjD6K9gSaTtAQPlmY8CraZFMzXZMEcgDnr0LKd0SvSeKPrUQAqqEU%2Bq0O3aXG4Q%3D%3D; SHTSID=F8504BDA308C40A1867B84AA984C7914; uc1=cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie21=U%2BGCWk%2F7pY%2FF&cookie15=UtASsssmOIJ0bQ%3D%3D&existShop=false&pas=0&cookie14=UoTZ5OSpoR6Xcg%3D%3D&tag=8&lng=zh_CN; uc3=vt3=F8dByEze4ekEsQsgc1A%3D&id2=VWeT3jqq6jDz&nk2=E6EQ1CLKS%2FnL&lg2=VT5L2FSpMGV7TQ%3D%3D; _l_g_=Ug%3D%3D; unb=682167773; cookie1=V3oTBcYJDILlbjtF3qOSEAd2Amf77M7oTu0rSZnkuIc%3D; login=true; cookie17=VWeT3jqq6jDz; _nk_=pengjun%5Cu674E;csg=1d4e91d8; skt=af3747a6827ebc42; _m_h5_tk=0ca05f482e46af75317d66b214d43689_1550465686263; _m_h5_tk_enc=2c51b947b84a5ef62f7c6523f04bbce9; x5sec=7b22726174656d616e616765723b32223a223762303932326363393666646437303062663361636430393164343932353530434b4731714f4d46454a335338706254376175634a686f4c4e6a67794d5459334e7a637a4f7a453d227d; whl=-1%260%260%260; 
l=bBN1mgHrvxpFLmphBOCwNQKXnqQTlIRRguSJGpWpi_5LUsvecl7OllzxWUv6Vj5P9zLB42mIJ0JTgFyQ5Ppf.; isg=BMPDJZxTm0CbSVClfvjWFCVzUofBKAVE096Au_WgcyLLtOHWfQtayJoiKgRfFK9y");return headers;}/*** 爬取评论内容*/@Testpublic void testSpider() throws IOException, InterruptedException {// 构建正则表达式对响应内容进行匹配过滤String regEx = "jsonp\\d+\\?\\(([\\s\\S]*)\\)";Pattern pat = Pattern.compile(regEx);// 请求地址String url = null;// 响应内容String retStr = null;Random random = new Random();// 用来保存评论到文件FileOutputStream fos = new FileOutputStream(new File("D:/简柔洁面巾_评论.txt"));OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");BufferedWriter bw = new BufferedWriter(osw);int lineCount = 0;// 逐页对评论进行爬取并写入文件for (int i = 1; i < 100; i++) {System.out.println("开始爬取第 " + i + " 页评论");url = MessageFormat.format(" https://rate.tmall.com/list_detail_rate.htm?itemId=41504319950&spuId=303661613&sellerId=1975415428&order=3&currentPage={0}&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvnpvRvphvUvCkvvvvvjiPRLspljEhn2qpsjthPmPyljDvRsLhtjrWPLsyAjn8RphvCvvvphmCvpvZ7Dl0eP5w7Di43kS5PbE4Bxi%2Fz1htvpvhvvCvpUwCvvpv9hCv2QhvCvvvMMGEvpCWvXfYMBlre8g7%2B3%2Bilj7Jyb8rwZDl%2BboJ%2BulABzcGeE9fV5EUAWAXeBOqb64B9Cka%2BfvsxI2heB6t%2BFBCAfyp%2Bu0OjomUy4oGULIKogyCvvOCvhE2zWoivpvUvvCC8Nrej68tvpvIvvCvpvvvvvvvvhOVvvvCw9vvB9OvvUHmvvCVC9vv9ogvvhOVvvmCb9hCvvOv9hCvvvvtvpvhvvCvp8wCvvpvvhHh9phv2HiwJSaQzHi475CnzT6Cvvyv9XRbIQvvD7w%3D&needFold=0&_ksTS=1550459693930_943&callback=jsonp944",i);retStr = SpiderHttpUtils.sendGet(true, url, null, getHeaders(), "utf-8");Thread.sleep(random.nextInt(4000) + 2000);Matcher mat = pat.matcher(retStr);if (mat.find()) {String jsonstr = mat.group(1);String finalJson = StringEscapeUtils.unescapeJava(jsonstr);try {JSONObject retJson = new JSONObject(finalJson);JSONObject rateDetail = retJson.getJSONObject("rateDetail");JSONArray rateList = rateDetail.getJSONArray("rateList");for (int index = 0; index < rateList.length(); index++) {JSONObject jsonObject = 
rateList.getJSONObject(index);bw.write(jsonObject.getString("rateContent"));lineCount += 1;bw.newLine();}} catch (Exception e) {e.printStackTrace();System.out.println(jsonstr);// break;}}}// 关闭文件流bw.close();System.out.println("共爬取 " + lineCount + " 行评论");}/*** 对爬取到的评论内容进行分词*/@Testpublic void testWord() throws IOException {FileReader reader = new FileReader("D:/简柔洁面巾_评论.txt");BufferedReader br = new BufferedReader(reader);String str = null;Map<String, IKWord> map = new HashMap<String, IKWord>();while ((str = br.readLine()) != null) {List<Word> words = WordSegmenter.seg(str, SegmentationAlgorithm.BidirectionalMaximumMatching);for (Word word : words) {String text = word.getText();IKWord ikWord = map.get(text);if (map.containsKey(text)) {ikWord.addCount(1);} else {ikWord = new IKWord();ikWord.setWord(text);ikWord.setCount(1);map.put(text, ikWord);}}}br.close();reader.close();FileWriter writer = new FileWriter("D:/简柔洁面巾_分词.txt");BufferedWriter bw = new BufferedWriter(writer);List<IKWord> list = new ArrayList<IKWord>();list.addAll(map.values());Collections.sort(list);Iterator<IKWord> iterator = list.iterator();while (iterator.hasNext()) {IKWord next = iterator.next();bw.write(next.getWord() + " " + next.getCount());bw.newLine();}bw.close();writer.close();}
}
public class IKWord implements Comparable<IKWord> {private Integer count;private String word;@Overridepublic int compareTo(IKWord that) {return that.count - this.count;}public boolean equals(IKWord that) {return (this.word.equals(that.word));}public Integer getCount() {return count;}public void setCount(Integer count) {this.count = count;}public String getWord() {return word;}public void setWord(String word) {this.word = word;}public void addCount(Integer count) {this.count += count;}}

 


http://www.ppmy.cn/news/169488.html

相关文章

高光谱图像分类

文章目录 论文笔记摘要介绍高光谱图像 HSI2-D-CNN3-D-CNN充分利用2-D和3-DCNN的自动特征学习能力 实验和分析公平比较分类结果的指标混合神经网络模型 结论 代码实现思考题3D卷积和2D卷积的区别多测试几次，训练网络会发现每次分类的结果都不一样的原因进一步提升高光…

输出“A、B...Z、AA、AB...AZ、BA、BB...BZ.......”的结构

最近有个内容需要按照如下方式开始编号，开头从“A”开始，然后依次一个大写字母一直到“Z”，然后再往后逢“Z”进位，即从“AA”继续， 下面的例子是从“A”一直输出到“AZZ”; 代码如下 /****************************…

北京54、西安80、CGCS2000、WGS84坐标系及区别

1. 简介 现在我国已经停用北京54和西安80坐标系，改用CGCS2000坐标系了。北京54和西安80是参心坐标系，CGCS2000是地心坐标系与WGS84一样，只是椭球扁率微小差异，实际坐标结果几乎一致，约有毫米级别的差异 。CGCS2000和W…

微信支付v3 AEAD_AES_256_GCM解密JS版本代码及验证 javascript

最近因为在写微信支付相关的代码，所以不可避免的涉及到加密解密的问题。而很多js的许多加密解密算法需要自行寻找，我也没有在网上找到一篇针对微信支付这个问题的综合类博客，所以在这里叙述一下我自己关于AEAD_AES_256_GCM解密的一个JS解决方…

Pentest Wiki Part4 后渗透(二)

Hacking Windows Active Directory 目录 1. Description 2. Workthrough 2.1 10.1.222.2032.1.1 Wordpress - Code Injection 2.2 10.1.222.2002.2.1 Port Scanning2.2.2 XP_CMDSHELL 2.3 10.1.222.2012.3.1 MS14-068 2.4 10.1.222.202 描述 Descriptionread Flag from C:\…

GPS经纬度坐标WGS84到东北天坐标系ENU的转换

文章目录 一、简介1.1 ECEF坐标系1.2 WGS-84坐标1.3 东北天坐标系（ENU） 二、坐标系间的转换2.1 LLA坐标系转ECEF坐标系2.2 ECEF坐标系转LLA坐标系2.3 ECEF坐标系转ENU坐标系2.4 ENU坐标系转ECEF坐标系2.5 LLA坐标系直接转ENU坐标系 参考资料打赏支付宝微…

【嵌入式】MCU外接Flash图片数据存取实例

一 问题背景 工程中需要使用大量的图片资源用于GUI显示，但是图片资源比较大，不能直接保存在MCU上，所以考虑外接Flash芯片用于图片数据的存储。实际使用中，将Flash芯片内的地址直接映射到芯片内部，读取映射的地址即可加…

Activiti工作流引擎

一、 Activiti概述： 前言： 系统的核心根本上是业务流程，工作流只是协助进行业务流程管理。 在没有使用工作流引擎时，可以采用状态字段来跟踪流程的变化情况，这样不同角色的用户，通过状态字段的取值来决定记…