文章分为单线程和多线程两个部分,选择单线程一个一个下载速度会很慢,多线程下载可以明显提升速度。但先用单线程写出代码,再在此基础上改动成多线程,思路会更加清晰,对初学者也更加友好!
单线程下载王者荣耀壁纸
对界面的了解
进入王者荣耀官网https://pvp.qq.com/在游戏资料下点击游戏壁纸进入壁纸界面
下滑观察,一共有24页
右键检查发现图片都在<div id="Work_List_Container_267733">中,下面的每一个<div class="p_newhero_item">代表一种图片
但这些图片并不在源代码中,而是保存在worklist……的json文件中,蓝色的部分才是真实的url
Response里存在着json内容,用json解析器看一下(注意这里需要把开头的jQuery17104507762352752318_1610428368416(和结尾的)去掉,不然解析会出错)
单线程代码如下:
# 1、通过https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&jsoncallback=jQuery17109837407810303742_1610418917807&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1610418918008
# 可以获取到高清壁纸的url
# 2、获取到高清壁纸的url后,通过parse.unquote可以进行解码,然后将最后的200变成0,就可以得到真实的高清壁纸图片了
# 3、获取图片的url中有一个page参数,通过修改page的值可以进行翻页,默认page是从0开始的
# 4、page只有24页,从0到23
from urllib import parse
from urllib import request

import requests
import os


# Request headers sent with the listing request.
# NOTE: replace the user-agent and cookie with the values from your own
# browser before running the script.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "cookie": "RK=/OpZ4Njjsp; ptcz=394043015c564423726c920b11083ee23442f2df97f4f143dc6951cbe15e63ef; pgv_pvi=6714924032; Qs_lvt_323937=1610418648; Qs_pv_323937=2275171914401574400; LW_sid=n1G6C170G4V148R6F5T9G635c6; LW_uid=z116s1w0O4K198w6j5q9P6F5I9; eas_sid=H116U1I0p4J1P8W6459966S7g1; pgv_info=ssid=s2693756474; pgv_pvid=7928325688; pvpqqcomrouteLine=index_wallpaper_wallpaper_wallpaper_wallpaper",
}


def extract_images(data):
    """Return the decoded image URLs of one wallpaper entry.

    Reads the keys ``sProdImgNo_1`` .. ``sProdImgNo_8`` from *data* (the
    original author describes them as the eight available resolutions),
    percent-decodes each value and turns a trailing ``/200`` into ``/0``.

    Only the trailing suffix is rewritten; a "200" that happens to occur
    elsewhere in the URL is left untouched.

    Raises:
        KeyError: if one of the eight ``sProdImgNo_N`` keys is missing.
    """
    suffix = "/200"
    image_urls = []
    for x in range(1, 9):
        img_url = parse.unquote(data['sProdImgNo_%d' % x])
        if img_url.endswith(suffix):
            img_url = img_url[:-len(suffix)] + "/0"
        image_urls.append(img_url)
    return image_urls


def main():
    """Download every wallpaper listed on one result page into ``images/<name>/``.

    The listing URL omits the ``jsoncallback`` parameter so the response body
    can be parsed with ``resp.json()``.
    """
    page_url = "https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=1&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1610418918008"
    resp = requests.get(page_url, headers=headers)
    result = resp.json()  # parsed JSON body
    for data in result['List']:
        image_urls = extract_images(data)
        name = parse.unquote(data['sProdName'])
        # Drop "1:1" and spaces: the author notes that some names contain a
        # colon or spaces and cannot be used as file names.
        dirpath = os.path.join("images", name).replace("1:1", "").replace(" ", "")
        # makedirs creates the parent "images" folder when missing and
        # tolerates wallpapers that share a name.
        os.makedirs(dirpath, exist_ok=True)
        for index, image_url in enumerate(image_urls, start=1):
            request.urlretrieve(image_url, os.path.join(dirpath, "%d.jpg" % index))
            print("%s下载完成!" % (image_url))


if __name__ == '__main__':
    main()
多线程下载王者荣耀壁纸
采用生产者消费者模型创建多线程,建立生产者和消费者类,在原有单线程的代码上略有改动,改动部分在代码注释说明
多线程代码如下:
import requests
from urllib import parse
from urllib import request
import os
import threading
import queue


# Request headers sent with every listing request.
# NOTE: same as in the single-threaded script - replace the user-agent and
# cookie with the values from your own browser before running.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "cookie": "RK=/OpZ4Njjsp; ptcz=394043015c564423726c920b11083ee23442f2df97f4f143dc6951cbe15e63ef; pgv_pvi=6714924032; Qs_lvt_323937=1610418648; Qs_pv_323937=2275171914401574400; LW_sid=n1G6C170G4V148R6F5T9G635c6; LW_uid=z116s1w0O4K198w6j5q9P6F5I9; eas_sid=H116U1I0p4J1P8W6459966S7g1; pgv_info=ssid=s2693756474; pgv_pvid=7928325688; pvpqqcomrouteLine=index_wallpaper_wallpaper_wallpaper_wallpaper",
}

# The wallpaper listing has 24 pages, numbered 0..23.
TOTAL_PAGES = 24

# Listing URL without the jsoncallback parameter, so the body is plain JSON.
PAGE_URL_TEMPLATE = "https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={page}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1610418918008"


class Producer(threading.Thread):
    """Thread that turns listing-page URLs into image download jobs.

    Takes page URLs from ``page_queue``, fetches and parses each page, and
    puts one ``{"image_url": ..., "image_path": ...}`` dict per image onto
    ``image_queue``.
    """

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        # Extra positional/keyword arguments (e.g. ``name``) go to Thread.
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self) -> None:
        while True:
            # get_nowait avoids the empty()-then-get() race: another producer
            # may take the last URL between the check and a blocking get(),
            # which would leave this thread waiting forever.
            try:
                page_url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            resp = requests.get(page_url, headers=headers)
            result = resp.json()
            for data in result['List']:
                image_urls = extract_images(data)
                name = parse.unquote(data['sProdName'])
                # Some names contain a colon or spaces and cannot be used as
                # file names, so strip "1:1" and spaces.
                dir_path = os.path.join("images", name).replace("1:1", "").replace(" ", "")
                # exist_ok tolerates duplicate names and concurrent creation
                # by another producer; makedirs also creates "images" itself.
                os.makedirs(dir_path, exist_ok=True)
                for index, image_url in enumerate(image_urls, start=1):
                    self.image_queue.put({
                        "image_url": image_url,
                        "image_path": os.path.join(dir_path, "%d.jpg" % index),
                    })


class Consumer(threading.Thread):
    """Thread that downloads the images described by jobs on ``image_queue``.

    Stops once no job has arrived for 10 seconds.
    """

    def __init__(self, image_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.image_queue = image_queue

    def run(self) -> None:
        while True:
            try:
                image_obj = self.image_queue.get(timeout=10)
            except queue.Empty:
                # No work for 10 seconds: treat the job stream as finished.
                break
            image_url = image_obj.get("image_url")
            image_path = image_obj.get("image_path")
            try:
                request.urlretrieve(image_url, image_path)
            except Exception:
                # Best effort: one failed download must not stop the worker.
                print(image_path + "下载失败!")
            else:
                print(image_path + "下载完成!")


def extract_images(data):
    """Return the decoded image URLs of one wallpaper entry.

    Reads the keys ``sProdImgNo_1`` .. ``sProdImgNo_8`` from *data*,
    percent-decodes each value and turns a trailing ``/200`` into ``/0``.
    Only the trailing suffix is rewritten; a "200" elsewhere in the URL is
    left untouched.

    Raises:
        KeyError: if one of the eight ``sProdImgNo_N`` keys is missing.
    """
    suffix = "/200"
    image_urls = []
    for x in range(1, 9):
        img_url = parse.unquote(data['sProdImgNo_%d' % x])
        if img_url.endswith(suffix):
            img_url = img_url[:-len(suffix)] + "/0"
        image_urls.append(img_url)
    return image_urls


def main():
    """Queue every listing page, then start 3 producers and 5 consumers."""
    page_queue = queue.Queue(TOTAL_PAGES)
    image_queue = queue.Queue(1000)
    # range(TOTAL_PAGES) covers pages 0..23, including the last page.
    for page in range(TOTAL_PAGES):
        page_queue.put(PAGE_URL_TEMPLATE.format(page=page))
    for x in range(3):
        th = Producer(page_queue, image_queue, name="生产者%d号" % x)
        th.start()
    for x in range(5):
        th = Consumer(image_queue, name="消费者%d号" % x)
        th.start()


if __name__ == '__main__':
    main()
最后说明,别忘了在目录中创建images文件夹
爬取完成的效果:
希望对大家有所帮助❥(^_-)