配置爬取app的东西看此博客
首先在手机上设置代理,我设置的是手动代理:端口写为8888,主机名填电脑的IPv4地址;电脑上的Fiddler也设置好之后,就可以开始抓包了
连接好以后,手机打开汽车之家app,点击二手车车源,fiddler抓到包如图(记得要先点一下那个黄条对数据进行解压)
然后发现手机端返回的数据结构是极其简单清晰的。
请求头用手机的请求头,然后就可以爬取了
import requests
from lxml import etree
import pandas as pd  # kept: present in the original file's imports
import csv
import os

# Output CSV path and header, shared by write_file.
CSV_PATH = 'D:/汽车之家2.csv'
CSV_HEADER = ['车名', '价格', '所在地', '已走路程', '购买年份', '好处']


def get_html(url):
    """Fetch one listing page and return its decoded HTML text.

    Uses the mobile-app User-Agent captured with Fiddler so the server
    returns the simple mobile page structure.  Returns None on any
    request failure or non-200 status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 9; ONEPLUS A6000 Build/PKQ1.180716.001; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/71.0.3578.99 Mobile Safari/537.36 autohomeapp/1.1+%28auto_android%3B9.9.3%3BT1sFhhD74d5Lzdqfy68A54uWVQQFg-_zy6ygMUcD9mIqrPuMbFuGWA%3B9%3BONEPLUS%2BA6000%3B4e9165e6a117c54ba41734298ae9b5ae%29 auto_android/9.9.3 nettype/wifi'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content.decode()
        # Was print('1') — make the failure mode visible.
        print('unexpected status code:', response.status_code, url)
        return None
    except requests.RequestException as exc:
        # Was a bare except + print('2'); only swallow network errors.
        print('request failed:', url, exc)
        return None


def get_info(html):
    """Parse a listing page and append each car's row to the CSV file.

    Each <li><a> holds: <h3>title</h3>, a <p> with "所在地/里程/年份",
    and <ins> whose first two text nodes form the price and whose
    remaining nodes are selling points.  Items missing any expected
    field are skipped.
    """
    if not html:
        # get_html returned None; etree.HTML(None) would raise.
        return
    selector = etree.HTML(html)
    info_list = []
    for a in selector.xpath('//li/a'):
        try:
            title = a.xpath('h3/text()')[0]
            # One split instead of three identical xpath+split passes.
            where, gone, release_time = a.xpath('p/text()')[0].strip().split('/')[:3]
            infos = a.xpath('ins//text()')
            price = infos[0] + infos[1]
            advantages = infos[2:]  # text after the price = selling points
            advantage = ','.join(advantages) if advantages else '无'
            row = [title, price, where, gone, release_time, advantage]
            print(row)
            info_list.append(row)
        except (IndexError, ValueError):
            # Item lacks a field (e.g. promo <li> without <h3>) — skip it.
            pass
    write_file(info_list)


def write_file(info_list):
    """Append rows to the CSV, writing the header first when needed.

    Fixes the original crash: os.path.getsize raised FileNotFoundError
    on the very first run because the file did not exist yet.  Opening
    in 'a+' mode creates the file, and the header is written exactly
    when the file is new or empty.
    """
    need_header = not os.path.exists(CSV_PATH) or os.path.getsize(CSV_PATH) == 0
    with open(CSV_PATH, 'a+', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        if need_header:
            writer.writerow(CSV_HEADER)
        writer.writerows(info_list)


if __name__ == '__main__':
    # Pages 1-20 of the nationwide used-car listing (mobile endpoint).
    urls = ['https://m.che168.com/app/quanguo/list/?sourcename=mainapp&safe={}&pvareaid=106365'.format(i) for i in range(1, 21)]
    for url in urls:
        get_info(get_html(url))
最后结果