一、Python 实现 App 数据抓取需求
1、分析豆果美食数据包
2、通过 Python 多线程（线程池）抓取数据
3、通过使用代理 IP 隐藏爬虫
4、将数据保存到 MongoDB 中
handle_mongo.py
import pymongo
from pymongo.collection import Collection


class Connect_mongo(object):
    """Thin wrapper around the local MongoDB database used by the spider.

    Creates a client for the MongoDB server at 127.0.0.1:27017 and exposes
    the ``dou_guo_mei_shi`` database as ``self.db_data``.
    """

    def __init__(self):
        self.client = pymongo.MongoClient(host="127.0.0.1", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]

    def insert_item(self, item):
        """Store ``item`` in the ``dgms_item`` collection.

        Args:
            item: A single document (dict), or a list/tuple of documents.
        """
        db_collection = Collection(self.db_data, "dgms_item")
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one()/insert_many() are the supported calls.
        # The list/tuple branch keeps the old "one or many" calling style.
        if isinstance(item, (list, tuple)):
            db_collection.insert_many(item)
        else:
            db_collection.insert_one(item)


# Module-level instance shared by every importer of this module.
mongo_info = Connect_mongo()
spider_dgms.py
import requests
import json
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor# 数据队列
# Work queue: handle_index() puts one search payload per sub-category here,
# and the __main__ block takes them off again to submit them to the thread pool.
# NOTE(review): Queue is imported from multiprocessing although the consumers
# are threads; its qsize() may raise NotImplementedError on platforms such as
# macOS -- confirm the target platform or consider queue.Queue.
queue_list = Queue()
# Request helper
def handel_request(url, data, proxies=None, timeout=10):
    """POST form ``data`` to ``url`` with the fixed app headers.

    Args:
        url: Full URL of the API endpoint.
        data: Dict of form fields sent as the request body.
        proxies: Optional ``requests`` proxy mapping keyed by URL scheme,
            e.g. ``{"http": "127.0.0.1:8888", "https": "127.0.0.1:8888"}``.
            An ``"https"`` entry is needed for the proxy to apply to https
            URLs. ``None`` (the default) sends the request without an
            explicit proxy mapping, exactly as before.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        The ``requests.Response`` object.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` expires.
    """
    # Device/session headers sent with every request. Cookie, channel,
    # pseudo-id and Content-Length are not sent (requests computes
    # Content-Length itself).
    header = {
        "client": "4",
        "version": "7106.2",
        "act-code": "1637324809",
        "act-timestamp": "1637324809",
        "pset": "1",
        "device": "SM-N976N",
        "brand": "samsung",
        "sdk": "25,7.1.2",
        "resolution": "1280*720",
        "dpi": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        "imsi": "460071317077478",
        "uuid": "f4d26323-9b23-403c-9187-e662ba7fc470",
        "User-Agent": "Mozilla/5.0 (Linux; Android 7.1.2; SM-N976N Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/92.0.4515.131 Mobile Safari/537.36",
        "battery-level": "0.98",
        "battery-state": "3",
        "caid": "44c57e66cae004c9",
        "bssid": "AC:22:0B:07:74:4E",
        "display-resolution": "1280*720",
        "scale": "1.5",
        "reach": "1",
        "rom-version": "d2que-user 7.1.2 QP1A.190711.020 700211101 release-keys",
        "syscmp-time": "1635765679000",
        "countrycode": "CN",
        "sysmemory": "3186032640",
        "sysdisksize": "61.39 GB",
        "terms-accepted": "1",
        "newbie": "1",
        "app-state": "0",
        "bootmark": "822206a5-4ae7-412f-8e85-601e91ff3d10",
        "updatemark": "1635817290.809861000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "session-info": "O0y/dPYTJAm0BbGoew14b/fQctKsmmjzQdUMA09SAmiCS7mYzKaP/73rPRSMQ2uIW+v7/szNkhJjhWHJizw+1luYaLUqgMlU1ieRCbekVZspYwlXLyRowX2oEkhJ0MIj",
        "Host": "api.douguo.net",
    }
    response = requests.post(
        url=url,
        headers=header,
        data=data,
        proxies=proxies,
        timeout=timeout,
    )
    return response


# Request the recipe category page
def handle_index():
    """Fetch the flat recipe catalog and queue one search payload per sub-category.

    Posts to the ``flatcatalogs`` endpoint, walks ``result.cs`` and each
    entry's nested ``cs`` list, and puts a search form (with the nested
    entry's ``name`` as ``keyword``) onto ``queue_list`` for every nested entry.
    """
    catalog_url = "https://api.douguo.net/recipe/flatcatalogs"
    catalog_form = {
        "client": "4",
        "_vs": "0",
        "sign_ran": "e53fd78533e564209a573d14bd449d83",
        "code": "a8665c30af67ce0c",
    }
    catalog_response = handel_request(catalog_url, catalog_form)
    catalog = json.loads(catalog_response.text)

    # Flatten the two-level category tree lazily, in document order.
    sub_categories = (
        child
        for parent in catalog["result"]["cs"]
        for child in parent["cs"]
    )
    for child in sub_categories:
        search_form = {
            "client": "4",
            "keyword": child["name"],
            "order": "0",
            "_vs": "400",
            "type": "0",
            "auto_play_mode": "2",
            "sign_ran": "1ce60f6319b32e96194116a84c331275",
            "code": "bf2b7920137d77f9",
        }
        queue_list.put(search_form)


# Fetch the recipe list
def _build_detail_ext(keyword, recipe_id):
    """Return the ``_ext`` form field for a recipe-detail request as JSON text."""
    # The previous hand-concatenated string left the keyword unquoted and
    # appended stray quotes after the id and the closing braces, so it was
    # not valid JSON. json.dumps handles quoting and escaping.
    # NOTE(review): the id is sent as a string like the other values --
    # confirm against a captured app request.
    return json.dumps(
        {
            "query": {
                "kw": keyword,
                "src": "11102",
                "idx": "2",
                "type": "13",
                "id": str(recipe_id),
            }
        },
        ensure_ascii=False,
        separators=(",", ":"),
    )


def handle_caipu_list(data):
    """Search recipes for one keyword and store each matching recipe in MongoDB.

    For every search hit whose ``type`` is 13, the recipe's detail page is
    requested and the combined record is passed to ``mongo_info.insert_item``.

    Args:
        data: Search form payload as queued by ``handle_index()``; its
            ``"keyword"`` entry is the term being searched.

    Raises:
        KeyError: If a response lacks one of the expected fields.
    """
    print("当前处理的食材:", data["keyword"])
    caipu_list_url = "https://api.douguo.net/recipe/v2/search/0/20"
    caipu_list_response = handel_request(url=caipu_list_url, data=data)
    caipu_list_response_dict = json.loads(caipu_list_response.text)

    for item in caipu_list_response_dict["result"]["list"]:
        # Only type-13 entries are handled (presumably the recipe entries;
        # other types are skipped).
        if item["type"] != 13:
            continue

        recipe = item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            "describe": recipe["cookstory"],
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        # Request the detailed cooking instructions for this recipe.
        detail_url = "https://api.douguo.net/recipe/v2/detail/" + str(caipu_info["shicai_id"])
        detail_data = {
            "client": "4",
            "_session": "1637373525108351564145807749",
            "author_id": "0",
            "_vs": "11102",
            "_ext": _build_detail_ext(caipu_info["shicai"], caipu_info["shicai_id"]),
            "is_new_user": "1",
            "sign_ran": "f588cc28475995ac6398393f5007e4be",
            "code": "584429579d7b207a",
        }
        detail_response = handel_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cook_step"] = detail_recipe["cookstep"]

        print("当前入库菜谱是: ", caipu_info["caipu_name"])
        mongo_info.insert_item(caipu_info)  # insert the record into MongoDB
        print("插入完成")


if __name__ == '__main__':
    # To try a single keyword instead, run handle_index() and then
    # handle_caipu_list(queue_list.get()).

    # Multi-threaded crawl: fill the queue, then fan the searches out over a
    # pool of 20 worker threads.
    handle_index()
    futures = []
    with ThreadPoolExecutor(max_workers=20) as pool:
        while queue_list.qsize() > 0:
            futures.append(pool.submit(handle_caipu_list, queue_list.get()))

    # Leaving the with-block waits for every task. Exceptions raised inside a
    # worker are stored on its Future, so report them instead of losing them.
    for future in futures:
        error = future.exception()
        if error is not None:
            print("Task failed:", repr(error))