wallhaven.cc网站图片超清壁纸爬虫

测试时间:2021-02-16

- 1.参考博客
- 2.python代码

1.参考博客

From（侵删）：
https://blog.csdn.net/qq_41849471/article/details/89607706

2.python代码

图片保存路径：
save_dir = 'C:/Users/Administrator/Pictures/wallpaper/'
缩略图

在 PyCharm 中直接运行脚本，图片即会下载到该目录；其他参数可参考上述博客。

import requests
from lxml import etree
import re
import os
import time

# Directory under which each run's download folder is created.
SAVE_ROOT = 'C:/Users/Administrator/Pictures/wallpaper/'

# Seconds to wait on any single HTTP request, so one stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 30

# Keyword-only search template: (keyword, page).
# NOTE(review): derived from the disabled alpha.wallhaven.cc template, with the
# host changed to the one used by the active template -- confirm against the
# live site before relying on mode 2.
KEYWORD_URL = 'https://wallhaven.cc/search?q={}&page={}'


def _fetch_tree(url):
    """Fetch *url* and return its body parsed by ``etree.HTML``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx status, so an error page is never parsed as content.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return etree.HTML(response.content)


def _flags_from_selection(selection_str, fallback):
    """Convert space-separated 1-based option numbers into three '0'/'1' flags.

    Any token that is not an integer in 1..3 makes the whole selection
    invalid and a copy of *fallback* is returned. An empty selection yields
    all-'0' flags; callers apply their own default for that case.
    """
    flags = ['0', '0', '0']
    for token in selection_str.split():
        try:
            index = int(token) - 1
        except ValueError:
            return list(fallback)
        # Reject 0 and negatives explicitly: a negative index would silently
        # set a flag counted from the end of the list.
        if not 0 <= index < len(flags):
            return list(fallback)
        flags[index] = '1'
    return flags


def get_pictures(url, folder_name, dest_count, c):
    """Download the full-size image shown on one wallpaper detail page.

    Args:
        url: Address of the detail page. The image address is taken from the
            ``src`` attribute of its ``<img id="wallpaper">`` element.
        folder_name: Sub-folder of ``SAVE_ROOT`` to save into; created
            (including missing parents) when it does not exist.
        dest_count: Unused; kept so existing callers keep working.
        c: Number of images downloaded so far; only used to number the
            progress message (printed as ``c + 1``).

    Returns:
        1 if the image was written to disk, 0 if any step failed.
    """
    # Until the real file name is known, label messages with the last
    # segment of the detail-page address.
    img_name = url.rstrip('/').split('/')[-1]
    try:
        tree = _fetch_tree(url)
        sources = [] if tree is None else tree.xpath('//img[@id="wallpaper"]/@src')
        if not sources:
            raise LookupError('no wallpaper image found on ' + url)
        img_url = sources[0]
        img_name = img_url.split('/')[-1]

        image = requests.get(img_url, timeout=REQUEST_TIMEOUT)
        # Without this check an HTML error page would be saved as the image.
        image.raise_for_status()

        save_dir = os.path.join(SAVE_ROOT, folder_name)
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, img_name), 'wb') as f:
            f.write(image.content)
    except (requests.RequestException, OSError, LookupError):
        # Only expected network/filesystem/parsing failures are reported as a
        # failed download; KeyboardInterrupt and programming errors propagate.
        print("正在下载第 {} 张图片=====> ".format(c+1)+img_name+' -----failure!')
        return 0
    print("正在下载第 {} 张图片=====> ".format(c+1)+img_name+' -----success!')
    return 1


def get_next_url(url, folder_name, stars_num, dest_count, all):
    """Download every sufficiently starred wallpaper listed on one search page.

    Args:
        url: Address of the search-result page.
        folder_name: Passed through to ``get_pictures``.
        stars_num: Minimum count (int or numeric string) a thumbnail must show
            to be downloaded.
        dest_count: Total number of images wanted; the process exits as soon
            as the running count reaches it.
        all: Number of images downloaded before this page. (The name shadows
            the builtin but is kept for compatibility with existing callers.)

    Returns:
        The running download count after this page.

    Raises:
        SystemExit: when the target count is reached, or when the page lists
            no wallpapers at all (after a 3 second pause).
        requests.RequestException: if the search page cannot be fetched.
    """
    tree = _fetch_tree(url)
    next_urls = [] if tree is None else tree.xpath("//a[@class='preview']/@href")
    stars = [] if tree is None else tree.xpath("//div[@class='thumb-info']/a[1]/text()")

    if len(next_urls) == 0:
        print("无更多图片！")
        time.sleep(3)
        raise SystemExit("0")

    min_stars = int(stars_num)
    downloaded = all
    # Assumes the two XPath results are in the same order, one count per
    # thumbnail link -- zip() stops at the shorter list instead of raising
    # IndexError if they ever differ in length.
    for detail_url, star_text in zip(next_urls, stars):
        try:
            star_count = int(star_text)
        except ValueError:
            continue  # count is not a plain integer; cannot compare, skip it
        if star_count < min_stars:
            continue
        downloaded += get_pictures(detail_url, folder_name, dest_count, downloaded)
        if downloaded >= dest_count:
            raise SystemExit("目标已达成！")
    return downloaded


def main():
    """Crawl wallhaven.cc search pages using the settings hard-coded below.

    Each hard-coded answer can be replaced by ``input()`` to make the
    corresponding prompt interactive.
    """
    print("请选择获取方式：1.范围选择 2.关键词搜索 3.二者结合")
    style = '1'
    categories = ['0', '0', '0']
    purity = ['0', '0', '0']
    url = ""
    keyword = ""
    # Only one range-search template is currently enabled:
    # (categories, purity, page).
    sort_list = [
        'https://wallhaven.cc/search?categories={}&purity={}&atleast=2560x1080&ratios=16x9&topRange=1M&sorting=toplist&order=desc&page={}',
    ]

    if style == '1' or style == '3':
        if style == '3':
            print("请输入搜索关键词(建议英文)：")
            keyword = input().replace(' ', '+')

        print("请选择图片类型：1.General 2.Anime 3.People (可多选,默认全选,空格分割选项)")
        selection_str = ''
        categories = _flags_from_selection(selection_str, ['1', '1', '1'])
        if selection_str == "":
            # The prompt promises "select all" as the default; previously an
            # empty answer left every category switched off ('000').
            categories = ['1', '1', '1']

        print("图片附加选项：1.SFW 2.Sketchy (可多选，默认选择1，空格分隔选项，建议选择SFW)")
        selection_str = ''
        purity = _flags_from_selection(selection_str, ['1', '0', '0'])
        # The prompt only offers options 1 and 2, so the third flag is
        # always forced off.
        purity[2] = '0'
        if selection_str == "":
            purity = ['1', '1', '0']

        print("请选择排序方式：1.Latest 2.Toplist 3.Random (单选，默认Random)")
        selection_str = '2'
        attempts = 1
        while selection_str not in ('1', '2', '3', '') and attempts <= 3:
            print("请正确选择(多次错误则默认选择)")
            selection_str = input()
            attempts += 1
        # With a single template enabled every choice resolves to it; the
        # previous sort_list[2] lookups raised IndexError.
        url = sort_list[0]
    elif style == '2':
        print("请输入搜索关键词(建议英文)：")
        keyword = input().replace(' ', '+')
        # Previously sort_list[3], which does not exist.
        url = KEYWORD_URL

    print("请输入文件夹的名称：")
    folder_name = 'wallhaven_wallpaper'
    while folder_name == "":
        folder_name = input()

    print("请输入最低的点赞数：")
    stars_num = 20

    print("请输入目标图片数量：")
    dest_count = 200

    # Images downloaded so far. Starts at 0 so the first progress message
    # reads "1" and exactly dest_count images are fetched (it used to start
    # at 1, which was off by one on both counts).
    downloaded = 0
    for page in range(1, 999):
        print('get the page: {}'.format(page))
        if style != '2':
            page_url = url.format("".join(categories), "".join(purity), page)
            if keyword:
                # Mode 3: the range template has no query slot, so the
                # keyword is appended instead of being silently dropped.
                page_url += '&q=' + keyword
        else:
            page_url = url.format(keyword, page)
        print("getting from " + page_url)
        downloaded = get_next_url(page_url, folder_name, stars_num, int(dest_count), downloaded)


if __name__ == "__main__":
    main()