爬虫_app 4 app数据抓取入门

news/2024/10/17 18:13:27/

一、python实现app数据抓取需求

1、分析豆果美食数据包

2、通过python多线程-线程池抓取数据

3、通过使用代理ip隐藏爬虫

4、将数据保存到 mongodb 中

handle_mongo.py

import pymongo
from pymongo.collection import Collection


class Connect_mongo(object):
    """Thin wrapper around a local MongoDB connection used to store recipes.

    Connects to ``127.0.0.1:27017`` and works on the ``dou_guo_mei_shi``
    database.
    """

    def __init__(self):
        self.client = pymongo.MongoClient(host="127.0.0.1", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]

    def insert_item(self, item):
        """Insert ``item`` into the ``dgms_item`` collection.

        Args:
            item: A single document (dict), or a list of documents.

        ``Collection.insert()`` was deprecated in PyMongo 3 and removed in
        PyMongo 4, so the explicit ``insert_one`` / ``insert_many`` calls are
        used instead. The list branch keeps the legacy ``insert()`` ability
        to accept several documents at once.
        """
        db_collection = Collection(self.db_data, "dgms_item")
        if isinstance(item, list):
            db_collection.insert_many(item)
        else:
            db_collection.insert_one(item)


# Module-level instance shared by importers of this module.
mongo_info = Connect_mongo()

spider_dgms.py

import json
import queue
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Queue

import requests

from handle_mongo import mongo_info

# Data queue
# Search payloads waiting to be crawled. The producers and consumers are
# threads of this one process, so the thread-safe ``queue.Queue`` is the right
# container; ``multiprocessing.Queue.qsize()`` raises NotImplementedError on
# platforms without ``sem_getvalue()`` (e.g. macOS).
queue_list = queue.Queue()

# Request helper
def handel_request(url, data, timeout=10):
    """POST ``data`` as a form to ``url`` with the captured app headers.

    Args:
        url: Full API endpoint URL.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would block a
            worker thread forever on a stalled connection.

    Returns:
        The ``requests.Response`` object returned by ``requests.post``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    header = {
        # "Cookie": "duid=69270019",
        "client": "4",
        "version": "7106.2",
        # "channel": "baidu",
        "act-code": "1637324809",
        "act-timestamp": "1637324809",
        "pset": "1",
        # "pseudo-id": "44c57e66cae004c9",
        "device": "SM-N976N",
        "brand": "samsung",
        "sdk": "25,7.1.2",
        "resolution": "1280*720",
        "dpi": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        "imsi": "460071317077478",
        "uuid": "f4d26323-9b23-403c-9187-e662ba7fc470",
        "User-Agent": "Mozilla/5.0 (Linux; Android 7.1.2; SM-N976N Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/92.0.4515.131 Mobile Safari/537.36",
        "battery-level": "0.98",
        "battery-state": "3",
        "caid": "44c57e66cae004c9",
        "bssid": "AC:22:0B:07:74:4E",
        "display-resolution": "1280*720",
        "scale": "1.5",
        "reach": "1",
        "rom-version": "d2que-user 7.1.2 QP1A.190711.020 700211101 release-keys",
        "syscmp-time": "1635765679000",
        "countrycode": "CN",
        "sysmemory": "3186032640",
        "sysdisksize": "61.39 GB",
        "terms-accepted": "1",
        "newbie": "1",
        "app-state": "0",
        "bootmark": "822206a5-4ae7-412f-8e85-601e91ff3d10",
        "updatemark": "1635817290.809861000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "session-info": "O0y/dPYTJAm0BbGoew14b/fQctKsmmjzQdUMA09SAmiCS7mYzKaP/73rPRSMQ2uIW+v7/szNkhJjhWHJizw+1luYaLUqgMlU1ieRCbekVZspYwlXLyRowX2oEkhJ0MIj",
        "Host": "api.douguo.net",
        # "Content-Length": "179",
    }
    # Optional proxy setup (disabled):
    # proxies = {"http": "127.0.0.1:8888"}  # general-purpose proxy
    # response = requests.post(url=url, headers=header, data=data, proxies=proxies)
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response


# Request the recipe category page
def handle_index():
    """Fetch the flat recipe catalogue and queue one search payload per entry.

    Posts to the ``flatcatalogs`` endpoint, then walks ``result -> cs`` and
    each group's nested ``cs`` list, putting a search form (keyed by the
    entry's ``name``) onto ``queue_list`` for every nested entry.
    """
    catalog_url = "https://api.douguo.net/recipe/flatcatalogs"
    catalog_form = {
        "client": "4",
        # "_session": "1637373525108351564145807749",
        # "v": "new1637324615",
        "_vs": "0",
        "sign_ran": "e53fd78533e564209a573d14bd449d83",
        "code": "a8665c30af67ce0c",
    }
    catalog_response = handel_request(catalog_url, catalog_form)

    # Parse the response body
    catalog = json.loads(catalog_response.text)
    for group in catalog["result"]["cs"]:
        for entry in group["cs"]:
            queue_list.put(
                {
                    "client": "4",
                    # "_session": "1637373525108351564145807749",
                    "keyword": entry["name"],
                    "order": "0",
                    "_vs": "400",
                    "type": "0",
                    "auto_play_mode": "2",
                    "sign_ran": "1ce60f6319b32e96194116a84c331275",
                    "code": "bf2b7920137d77f9",
                }
            )


# Fetch the recipe list
def _build_detail_form(keyword, recipe_id):
    """Return the form fields for the recipe-detail request of ``recipe_id``."""
    # The original concatenated this field by hand and produced malformed JSON
    # (unquoted keyword, unbalanced quote around the id, stray trailing quote).
    # NOTE(review): well-formed compact JSON is assumed to be what the API
    # expects - confirm against a captured request.
    ext = json.dumps(
        {
            "query": {
                "kw": keyword,
                "src": "11102",
                "idx": "2",
                "type": "13",
                "id": str(recipe_id),
            }
        },
        ensure_ascii=False,
        separators=(",", ":"),
    )
    return {
        "client": "4",
        "_session": "1637373525108351564145807749",
        "author_id": "0",
        "_vs": "11102",
        "_ext": ext,
        "is_new_user": "1",
        "sign_ran": "f588cc28475995ac6398393f5007e4be",
        "code": "584429579d7b207a",
    }


def handle_caipu_list(data):
    """Search recipes for one keyword and store each hit with its details.

    Args:
        data: Search form dict; ``data["keyword"]`` is the search keyword and
            is stored with every saved recipe as ``shicai``.

    For every search result whose ``type`` is 13, a detail request is made
    and the combined record is written through ``mongo_info.insert_item``.
    Results of any other type are skipped.
    """
    print("当前处理的食材:", data["keyword"])
    caipu_list_url = "https://api.douguo.net/recipe/v2/search/0/20"
    caipu_list_response = handel_request(url=caipu_list_url, data=data)
    caipu_list_response_dict = json.loads(caipu_list_response.text)

    for item in caipu_list_response_dict["result"]["list"]:
        # Only type-13 entries carry the recipe payload under "r" that is read
        # below (presumably the other types are ads/banners - TODO confirm).
        if item["type"] != 13:
            continue

        recipe = item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            "describe": recipe["cookstory"],
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        # Request the detailed cooking instructions
        detail_url = "https://api.douguo.net/recipe/v2/detail/" + str(caipu_info["shicai_id"])
        detail_data = _build_detail_form(caipu_info["shicai"], caipu_info["shicai_id"])
        detail_response = handel_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cook_step"] = detail_recipe["cookstep"]

        print("当前入库菜谱是: ", caipu_info["caipu_name"])
        mongo_info.insert_item(caipu_info)  # store the record in MongoDB
        print("插入完成")


if __name__ == '__main__':
    # Single-recipe run, kept from the original for manual testing:
    # handle_index()
    # handle_caipu_list(queue_list.get())

    # Multi-threaded crawl
    handle_index()
    # The context manager waits for all submitted tasks and releases the
    # worker threads when the block exits.
    with ThreadPoolExecutor(max_workers=20) as pool:
        futures = []
        while queue_list.qsize() > 0:
            futures.append(pool.submit(handle_caipu_list, queue_list.get()))
        # An exception raised inside a worker is stored on its future and
        # would otherwise disappear silently; report each failure.
        for future in futures:
            error = future.exception()
            if error is not None:
                print("crawl task failed:", repr(error))


http://www.ppmy.cn/news/394876.html

相关文章

深入了解viewport和px

先来罗列下学习移动页面重构的过程中可能看到过迷糊过放弃过的一些单位: px、pt、pc、sp、em、rem、dp、dip、ppi、dpi、ldpi、mdpi、hdpi、xhdpi、xxhdpi…… 接下来分类给大家介绍下: ① 什么是绝对长度单位?什么是相对长度单位？…

移动端开发资源

使用Flexible实现手淘H5页面的终端适配(淘宝官方):https://github.com/amfe/article/issues/17 移动前端自适应解决方案和比较:http://caibaojian.com/mobile-responsive-example.html 如何适配不同分辨率和不同屏幕尺寸的手机&…

三星sm-g7106com.android.mms,Android系统版本及其屏幕适配

Android设备分辨率趋势 对应关系表 资源文件夹 屏幕分辨率 类型 对应图标尺寸 屏幕密度1dp xhdpi超高分辨率 1280*720 WQVGA 96*96 320 2 hdpi 高分辨率480*800 WVGA 72*72 240 1.5 mdpi 中等分辨率 480*320 HVGA、VGA 48*48 160 1 ldpi 低分辨率 320*240 …

WRF后处理:模拟结果插值到站点

NCL在WRF模式的后处理中,有很多强大的函数以及现成易用的脚本,因此即使NCL官方不再继续更新,但NCL在WRF模式后处理中仍然不失为最合适的语言之一。本文以NCL为例,介绍如何将WRF模拟结果插值到站点,包括特定的高度层和气…

100集华为HCIE安全培训视频教材整理 | 防火墙用户管理与认证技术(一)

学习视频来源:《乾颐堂HCIP-HCIE-security安全 2019年录制》 V100只有认证和不认证V500有portal认证、短信认证、免认证、不认证认证策略只能针对于会话认证、免认证 需要手工配置用户的IP与MAC地址绑定

git push 报错[remote rejected] (failed to update ref)

在push代码的时候,报了一个[remote rejected] (failed to update ref)的错,简单记录下 解决方式1: git config remote.origin.push refs/heads/*:refs/for/* 再push,未成功 解决方式2: 观察git push日…

PVE的LXC容器系统安装CUPS软件,实现多平台共享的印表机

PVE的LXC容器系统安装CUPS软件,实现多平台共享的印表机 前言: 家里使用Canon LBP6030的USB印表机,连接住路由器(Youku1)的Padavan(老毛子)系统实现无线打印,但只能在Windows的平台实现无线打印,然而Apple的系统(如: macOS、IOS)就…