要么不做,要做就做完
此项目目录结构
项目分析
环境准备
开发准备
数据库表
添加依赖
添加配置文件
编写pojo
编写dao
编写service接口
service实现类
SpringBoot引导类
封装Htmlunit
实现数据抓取
成功展示
前段时间有一件事:“格力举报奥克斯空调质量”,我看了一下京东上这两家的店铺,感觉很有意思,看着就觉得奥克斯空调的选购指数高很多。所以,就尝试爬一下看看,当作练手的小demo。
jd页面数据绝大多数是通过Ajax请求获取的,我用浏览器调试工具(F12)发现这些Ajax请求很复杂,多层调用,并且关键数据做了混淆:直接请求Ajax链接返回的数据,还需要经过特定JS处理才能得到原始数据。我在这里卡了很久,最后通过HtmlUnit(带JS解析器,可以爬取动态页面)解决了。
项目分析
主要是想爬格力和奥克斯空调各型号的选购指数,顺带把商品标题、价格、评论人数、店铺等也都爬了一遍。
一个系列有多个型号(大小匹数),但同一系列内各型号的选购指数差别不大,所以不用每个型号都爬。
环境准备
jdk1.8
maven
mysql
SpringBoot
开发准备
数据库表
根据分析和结合实际,我们创建如下表
-- Table holding the products crawled from JD; columns mirror the Itempojo entity.
CREATE TABLE `jd_item` (
  `id`      BIGINT(10)   NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `spu`     BIGINT(15)   DEFAULT NULL COMMENT '商品集合id',
  `sku`     BIGINT(15)   DEFAULT NULL COMMENT '商品最小品类单元id',
  `title`   VARCHAR(100) DEFAULT NULL COMMENT '商品标题',
  `price`   DOUBLE       DEFAULT NULL COMMENT '商品价格',
  `url`     VARCHAR(200) DEFAULT NULL COMMENT '商品详情地址',
  `created` DATETIME     DEFAULT NULL COMMENT '创建时间',
  `updated` DATETIME     DEFAULT NULL COMMENT '更新时间',
  `comment` DOUBLE       DEFAULT NULL COMMENT '评价人数',
  `score`   DOUBLE       DEFAULT NULL COMMENT '选购指数',
  `shop`    VARCHAR(100) DEFAULT NULL COMMENT '选购店铺',
  PRIMARY KEY (`id`),
  -- Secondary index on sku.
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京东商品表';
添加依赖
使用Spring Boot+Spring Data JPA和定时任务进行开发,HtmlUnit获取动态网页,Jsoup解析页面。
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the JD crawler demo. Versions not given here are managed by the Spring Boot parent. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.2.RELEASE</version>
    </parent>

    <groupId>cn.itcast</groupId>
    <artifactId>itcast-crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- Spring MVC -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
        <!-- HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- HtmlUnit: headless browser used to load the JavaScript-rendered pages -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.32</version>
        </dependency>
        <!-- Jsoup: HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- Utilities -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
        </dependency>
    </dependencies>
</project>
添加配置文件
# DB configuration
spring.datasource.driverClassName=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler
spring.datasource.username=root
# The password value must end at the line break: in a .properties file '#' only
# starts a comment at the beginning of a line, so a trailing "#..." would become
# part of the password.
spring.datasource.password=root

# JPA configuration
spring.jpa.database=MySQL
spring.jpa.show-sql=true
编写pojo
package cn.itcast.jd.pojo;

import javax.persistence.*;
import java.util.Date;

/**
 * JPA entity mapped to the {@code jd_item} table: one product tile scraped from a JD list page.
 *
 * <p>The CSS selectors quoted in the field comments record where each value was located on the
 * list page when the crawler was written.
 */
@Entity
@Table(name = "jd_item")
public class Itempojo {

    /** Primary key, generated by the database (identity column). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Standard product unit (product group id). Selector: #plist > ul > li:nth-child(26) > div */
    private Long spu;

    /** Stock keeping unit (smallest sellable variant id). Selector: #plist > ul > li:nth-child(26) > div */
    private Long sku;

    /** Product title. Selector: #plist > ul > li:nth-child(26) > div > div.p-name > a > em */
    private String title;

    /** Product price. Selector: #plist > ul > li:nth-child(26) > div > div.p-price > strong:nth-child(1) > i */
    private Double price;

    /** Product detail page address. Selector: #plist > ul > li:nth-child(26) > div > div.p-name > a */
    private String url;

    /** Creation time of the record. */
    private Date created;

    /** Last update time of the record. */
    private Date updated;

    /** Number of reviewers. Selector: #plist > ul > li:nth-child(26) > div > div.p-commit > strong > a */
    private Double comment;

    /** Purchase index ("选购指数"). Selector: #plist > ul > li:nth-child(26) > div > div.p-commit > strong > a */
    private Double score;

    /** Shop selling the product. Selector: #plist > ul > li:nth-child(26) > div > div.p-shop > span > a */
    private String shop;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public Long getSpu() {
        return spu;
    }

    public void setSpu(Long spu) {
        this.spu = spu;
    }

    public Long getSku() {
        return sku;
    }

    public void setSku(Long sku) {
        this.sku = sku;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Date getCreated() {
        return created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getUpdated() {
        return updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }

    public Double getComment() {
        return comment;
    }

    public void setComment(Double comment) {
        this.comment = comment;
    }

    public Double getScore() {
        return score;
    }

    public void setScore(Double score) {
        this.score = score;
    }

    public String getShop() {
        return shop;
    }

    public void setShop(String shop) {
        this.shop = shop;
    }
}
编写dao
/**
 * Spring Data JPA repository for {@link Itempojo} entities, keyed by their {@code Long} id.
 * Declares no methods of its own; everything comes from {@code JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Itempojo,Long> {
}
编写service接口
/**
 * Service-layer contract for storing and looking up crawled {@link Itempojo} records.
 */
public interface ItemService {

    /**
     * Stores the given item.
     *
     * @param itempojo the item to store
     */
    void save(Itempojo itempojo);

    /**
     * Looks up items using the given item as the search criteria.
     *
     * @param itempojo the item whose values describe what to look for
     * @return the matching items
     */
    List<Itempojo> findAll(Itempojo itempojo);
}
service实现类
@Service
public class ItemServiceImpl implements ItemService {@Autowiredprivate ItemDao itemDao;@Override@Transactionalpublic void save(Itempojo itempojo) {this.itemDao.save(itempojo);}@Overridepublic List<Itempojo> findAll(Itempojo itempojo) {//声明查询条件Example<Itempojo> example = Example.of(itempojo);//根据查询条件进行查询数据List<Itempojo> list = this.itemDao.findAll(example);return list;}
}
SpringBoot引导类
/**
 * Spring Boot entry point of the crawler.
 *
 * <p>Scheduled tasks only run once scheduling is switched on, which is what
 * {@code @EnableScheduling} does here.
 */
@SpringBootApplication
@EnableScheduling
public class Application {

    /**
     * Starts the Spring application.
     *
     * @param args command-line arguments, handed to Spring unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}
封装Htmlunit
@Component
public class Httputils {public Httputils() {}public String doGetHtml(String str) {final WebClient webClient = new WebClient(BrowserVersion.CHROME);//新建一个模拟谷歌Chrome浏览器的浏览器客户端对象webClient.getOptions().setThrowExceptionOnScriptError(false);//当JS执行出错的时候是否抛出异常, 这里选择不需要webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);//当HTTP的状态非200时是否抛出异常, 这里选择不需要webClient.getOptions().setActiveXNative(false);webClient.getOptions().setCssEnabled(false);//是否启用CSS, 因为不需要展现页面, 所以不需要启用webClient.getOptions().setJavaScriptEnabled(true); //很重要,启用JSwebClient.setAjaxController(new NicelyResynchronizingAjaxController());//很重要,设置支持AJAXwebClient.waitForBackgroundJavaScript(15000);//异步JS执行需要耗时,所以这里线程要阻塞15秒,等待异步JS执行结束
//cookie删除了HtmlPage page = null;try {page = webClient.getPage(str);//尝试加载网页} catch (Exception e) {e.printStackTrace();} finally {webClient.close();}String pageXml = page.asXml();//直接将加载完成的页面转换成xml格式的字符串return pageXml;}}
实现数据抓取
这里在一个类里全部完成了。本来应该分开的,模块化开发嘛,但是这个小Demo只是练手的,就合在一起了。
/**
 * Scheduled crawler: walks the JD list pages of one brand, extracts every product tile and
 * stores products whose SKU is not in the database yet.
 */
@Component
public class ItemTask {

    /** List-page address for Gree; the page number is appended. Swap into {@link #itemTask()} to crawl Gree. */
    private static final String GREE_LIST_URL =
            "https://list.jd.com/list.html?cat=737,794,870&ev=exbrand_7420&page=";

    /** List-page address for AUX; the page number is appended. */
    private static final String AUX_LIST_URL =
            "https://list.jd.com/list.html?cat=737,794,870&ev=exbrand_3659&page=";

    /** Last list page to crawl (pages are numbered from 1). */
    private static final int LAST_PAGE = 5;

    /** Pause before each page request, in milliseconds. */
    private static final long PAGE_PAUSE_MILLIS = 5000;

    @Autowired
    private Httputils httputils;

    @Autowired
    private ItemService itemService;

    /**
     * Crawls list pages 1 to {@value #LAST_PAGE} of the configured brand. The next run starts
     * a fixed delay after the previous run has finished.
     *
     * @throws Exception if the thread is interrupted while pausing or a page cannot be processed
     */
    @Scheduled(fixedDelay = 11 * 2500)
    public void itemTask() throws Exception {
        String url = AUX_LIST_URL;

        for (int i = 1; i <= LAST_PAGE; i++) {
            Thread.sleep(PAGE_PAUSE_MILLIS);
            String html = httputils.doGetHtml(url + i);
            System.out.println("测试" + i);
            // Parse the page, then extract and store the product data.
            this.parse(html);
        }
        System.out.println("手机数据抓取完成!");
    }

    /**
     * Parses one list page and saves every product on it that is not stored yet.
     *
     * @param html the list page markup; a null or empty page is reported and skipped
     */
    private void parse(String html) throws Exception {
        if (html == null || html.isEmpty()) {
            System.out.println("数据异常");
            return;
        }
        Document doc = Jsoup.parse(html);
        Elements spuEles = doc.select("li.gl-item > div");

        for (Element spuEle : spuEles) {
            long spu;
            long sku;
            try {
                spu = Long.parseLong(spuEle.attr("data-sku_temp"));
                sku = Long.parseLong(spuEle.attr("data-active-sku"));
            } catch (NumberFormatException e) {
                // A tile without usable ids must not abort the whole run; skip just this tile.
                System.out.println("数据异常");
                continue;
            }

            Itempojo item = new Itempojo();
            item.setSku(sku);
            // Only the SKU is set at this point, so the lookup is by SKU alone.
            List<Itempojo> list = this.itemService.findAll(item);
            if (list.size() > 0) {
                // Already stored: move on to the next product.
                continue;
            }

            item.setSpu(spu);
            item.setUrl("https://item.jd.com/" + sku + ".html");
            item.setTitle(spuEle.select("div.p-name > a > em").text());

            // The price node can be missing or read "暂无报价" (no quote available); both
            // are stored as 0.0.
            Element priceEle = spuEle.select("div.p-price > strong > i").first();
            item.setPrice(valueOrFlagged(parseNumber(priceEle == null ? null : priceEle.text())));

            // Crawl time.
            item.setCreated(new Date());
            item.setUpdated(item.getCreated());

            // Number of reviewers, e.g. "5000+" or "1.2万+".
            item.setComment(valueOrFlagged(parseCount(spuEle.select("div.p-commit > strong > a").text())));

            // Purchase index.
            item.setScore(valueOrFlagged(parseNumber(spuEle.select("div.p-commit > span > em").text())));

            item.setShop(spuEle.select("div.p-shop > span > a").text());

            // Try to fill in values the list page did not provide, then store the product.
            Itempojo itempojo = this.check(item);
            this.itemService.save(itempojo);
        }
    }

    /**
     * Re-reads the review count and purchase index from the product detail page when either of
     * them is missing (null or 0.0). Each value is replaced only when the detail page yields a
     * parsable number.
     *
     * @param item the item to complete; its url is used to load the detail page
     * @return the same item instance, possibly updated
     */
    public Itempojo check(Itempojo item) {
        if (isMissing(item.getComment()) || isMissing(item.getScore())) {
            String page = httputils.doGetHtml(item.getUrl());
            if (page == null || page.isEmpty()) {
                return item;
            }
            Document document = Jsoup.parse(page);

            Double comment = parseCount(document.select("#comment-count > a").text());
            if (comment != null) {
                item.setComment(comment);
            }

            // Decide on the score text itself, not on the comment text.
            Double score = parseNumber(document.select("#buy-rate > a").text());
            if (score != null) {
                item.setScore(score);
            }
        }
        return item;
    }

    /** Returns true when the value was never set or is the 0.0 placeholder. */
    private static boolean isMissing(Double value) {
        return value == null || value == 0.0;
    }

    /** Returns the parsed value, or prints the anomaly marker and returns 0.0 when it is null. */
    private static double valueOrFlagged(Double parsed) {
        if (parsed == null) {
            System.out.println("数据异常");
            return 0.0;
        }
        return parsed;
    }

    /** Parses a plain decimal number; returns null for null, blank or non-numeric text. */
    private static Double parseNumber(String text) {
        if (text == null) {
            return null;
        }
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Double.valueOf(trimmed);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Parses a count such as "5000", "5000+" or "1.2万+" ("万" multiplies by 10000); returns
     * null when the text is not a number in one of those forms.
     */
    private static Double parseCount(String text) {
        if (text == null) {
            return null;
        }
        String digits = text.trim();
        if (digits.endsWith("+")) {
            digits = digits.substring(0, digits.length() - 1);
        }
        boolean tenThousands = digits.endsWith("万");
        if (tenThousands) {
            digits = digits.substring(0, digits.length() - 1);
        }
        Double base = parseNumber(digits);
        if (base == null) {
            return null;
        }
        return tenThousands ? base * 10000 : base;
    }
}
(此部分代码注释很多删除了,如需下载源码,请访问!!)
成功展示
这篇文章太长了,就单单讲代码吧,关于数据、及数据分析的部分,就下一篇文章吧。