Ultra-HD Wallpaper Crawler
- Overview
- 1. Searching for and Downloading Ultra-HD Wallpapers
Overview
This article shows how to download ultra-HD wallpapers with a Python crawler.
Approach: send requests to the search pages with the requests module, extract the image links with XPath (via lxml's etree), then create a folder named after the search keyword in the working directory and save each image's binary data into it.
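Before the full script, the pipeline can be illustrated with a minimal sketch. The URL, the XPath expression, and the output folder name below are placeholders chosen for illustration only; the selectors actually used by the target site appear in the full source further down.

import os
import requests
from lxml import etree

# Minimal sketch: fetch one page, pull image URLs with XPath, save each image as bytes.
# "https://example.com/wallpapers" and '//img[@class="wallpaper"]/@src' are placeholder
# values; real selectors depend on the HTML structure of the page being scraped.
url = "https://example.com/wallpapers"
headers = {"user-agent": "Mozilla/5.0"}

page = requests.get(url, headers=headers, timeout=10)
page.encoding = 'utf-8'
tree = etree.HTML(page.text)                       # parse the HTML for XPath queries
img_urls = tree.xpath('//img[@class="wallpaper"]/@src')

os.makedirs("wallpapers", exist_ok=True)           # create the output folder if needed
for i, img_url in enumerate(img_urls):
    data = requests.get(img_url, headers=headers, timeout=10).content  # binary image data
    with open(f"./wallpapers/{i}.jpg", mode='wb') as f:
        f.write(data)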
Note: the full implementation source follows; it is provided for reference only.
1. Searching for and Downloading Ultra-HD Wallpapers
import os

try:
    from lxml import etree
    import requests
except ImportError:
    print("Installing the required libraries, please wait...")
    os.popen(cmd='pip install lxml -i https://pypi.doubanio.com/simple/').read()
    os.popen(cmd='pip install requests -i https://pypi.doubanio.com/simple/').read()

import requests
from lxml import etree


def meitu_search_engine(keyword='美女', page_num=1):
    """Search bizhizu.cn for wallpapers matching `keyword` and download them into ./<keyword>/."""
    url_1 = f"https://www.bizhizu.cn/search/{keyword}/{page_num}.html"
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"}
    response = requests.get(url=url_1, headers=headers, timeout=10)  # fetch the search-result page
    response.encoding = 'utf-8'
    if response.status_code == 200:
        print("\tSearch page fetched successfully!")
        page_text_1 = response.text
        tree_1 = etree.HTML(page_text_1)  # build an etree object for XPath queries
        # Wallpaper titles and the links to their detail pages
        list_a = tree_1.xpath('//div[@class="imgcont"]/ul/li/a/text()')
        list_a_href = tree_1.xpath('//div[@class="imgcont"]/ul/li/a[2]/@href')
        list_meitu_source_code = []
        for link in list_a_href:
            # Detail page: retry until the request succeeds
            while True:
                try:
                    response_2 = requests.get(url=link, headers=headers, timeout=3)
                except requests.exceptions.RequestException:
                    continue
                else:
                    break
            response_2.encoding = 'utf-8'
            page_text_2 = response_2.text
            tree_2 = etree.HTML(page_text_2)
            # Link to the full-size viewing page
            list_link_2 = tree_2.xpath('//p[@class="text_con" and @id="photoDesc"]/a[1]/@href')
            print("\tImage page link:", list_link_2)
            while True:
                try:
                    page_text_3 = requests.get(url=f"https://www.bizhizu.cn{list_link_2[0]}",
                                               headers=headers, timeout=3).text
                except requests.exceptions.RequestException:
                    continue
                else:
                    break
            tree_3 = etree.HTML(page_text_3)
            # Direct URL of the full-resolution image
            list_link_3 = tree_3.xpath('//div[@class="show-pages-imgshow"]/img/@src')
            while True:
                try:
                    page_text_4 = requests.get(url=list_link_3[0], headers=headers,
                                               timeout=3).content  # download the image as binary data
                except requests.exceptions.RequestException:
                    continue
                else:
                    break
            list_meitu_source_code.append(page_text_4)
        list_name = list_a
        print(f"\tNumber of titles scraped = {len(list_name)}")
        print(f"\tNumber of images downloaded = {len(list_meitu_source_code)}")
        # Write each image into the folder named after the keyword
        for name, link in zip(list_name, list_meitu_source_code):
            with open(f"./{keyword}/{name}.jpg", mode='wb') as obj:
                obj.write(link)
    return


def check_folder(file_name):
    """Create the download folder if it does not already exist."""
    if not os.path.isdir(file_name):
        os.mkdir(file_name)
    return


if __name__ == '__main__':
    while True:
        # Keep asking until a non-empty keyword is entered
        while True:
            try:
                keyword = input("Enter a search keyword: ").strip()
                if not keyword:
                    raise ValueError
            except ValueError:
                print("\tInvalid input!")
                continue
            else:
                break
        # Keep asking until a positive page count is entered
        while True:
            try:
                page_num = int(input("How many pages do you want to crawl? (>0): "))
                if page_num <= 0:
                    raise ValueError
            except ValueError:
                print("\tInvalid input!")
                continue
            else:
                break
        check_folder(file_name=keyword)
        print('\t-------------------------------')
        for i in range(1, page_num + 1):
            print(f"\tCrawling page {i}")
            try:
                meitu_search_engine(keyword=keyword, page_num=i)
            except Exception:
                print("\tNo results found for this page.")
        print('\t-------------------------------')
        select = input("\n\tContinue searching for more? (y/n): ")
        if select in ['n', 'N', 'NO', 'No']:
            break
        print()
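One design note: the script retries every failed request inside a `while True` loop, so a permanently dead link would make it hang. A possible alternative is a bounded-retry helper such as the sketch below; `fetch_with_retries` is a name introduced here for illustration and is not part of the original script.

import requests


def fetch_with_retries(url, headers, retries=3, timeout=3):
    """Try the request up to `retries` times; return the response, or None if every attempt fails."""
    for _ in range(retries):
        try:
            return requests.get(url=url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            continue
    return None

Each `while True: try/except` retry loop in `meitu_search_engine` could then be replaced by a single call to this helper plus a check for `None`.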