最新的某131网站精美套图爬取代码出炉,截至2020年4月13日有效,之后就不知道啦。
来,各位看客老爷们可以搞一下:
import requests
import json
import re
import time
import os

# Seconds to wait for any single HTTP request before giving up, so that one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Directory template for one picture set; ``%d`` is the numeric set ID.
# NOTE: the ``******`` segment is a redacted placeholder from the original
# post and must be replaced with a real folder name before running.
PIC_DIR_TEMPLATE = r'D:\******\WWW\photo\mm131\%d'


def find_img_src(html):
    """Collect the ``content="..."`` values of every ``<meta ... />`` tag.

    This function is not actually used by the crawl below; it is kept as a
    debugging helper.

    Args:
        html: Page markup as a string.

    Returns:
        A list with one entry per self-closed ``<meta>`` tag found; each
        entry is the list of ``content`` attribute values matched in it.
    """
    # A non-capturing group is required here: the former ``[meta|META]`` was
    # a character class and matched any tag starting with one of its letters.
    replace_pattern = r'<(?:meta|META)\b.*?/>'
    img_url_pattern = r'.+?content="(\S+)"'
    return [
        re.findall(img_url_pattern, tag)
        for tag in re.findall(replace_pattern, html)
    ]


def find_set_span(html):
    """Extract the total page count of a picture set from its first page.

    Looks inside ``<div class="paging">`` for a ``<span>`` whose text has
    the form ``current/total`` and returns ``total``.

    Args:
        html: Page markup as a string.

    Returns:
        The total number of pages as an int, or ``None`` when no paging
        block with a ``current/total`` span is present.
    """
    paging_pattern = r'<div class="paging".*?</div>'
    span_pattern = r'<span.*?</span>'
    # Parse the digits after the slash instead of slicing fixed character
    # positions, so one-digit and three-digit totals are read correctly.
    page_count_pattern = r'>\s*\d+\s*/\s*(\d+)'
    for paging_block in re.findall(paging_pattern, html, re.DOTALL):
        for span_statement in re.findall(span_pattern, paging_block, re.DOTALL):
            match = re.search(page_count_pattern, span_statement)
            if match:
                return int(match.group(1))
    return None


headers = {
    "User-Agent": "Mozilla/5.0",
    "referer": "https://m.mm131.net",
    "Host": "m.mm131.net",
}


def _fetch(url, request_headers):
    """GET ``url`` with a timeout; return the response, or None on a network error."""
    try:
        return requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return None


def _download_set(set_id, page_count, pic_path):
    """Download pages 1..page_count of one set into the directory ``pic_path``."""
    for sequence in range(1, page_count + 1):
        # The image host is sent the matching gallery page as the referer:
        # page 1 has no suffix, later pages are "<id>_<n>.html".
        if sequence == 1:
            referer_url = "https://m.mm131.net/xinggan/" + str(set_id) + ".html"
        else:
            referer_url = ("https://m.mm131.net/xinggan/" + str(set_id)
                           + "_" + str(sequence) + ".html")
        pic_headers = {
            "User-Agent": "Mozilla/5.0",
            "referer": referer_url,
            "host": "img1.mmmw.net",
        }
        img_url = ("https://img1.mmmw.net/pic/" + str(set_id)
                   + "/" + str(sequence) + ".jpg")

        # Download the picture.
        pic_response = _fetch(img_url, pic_headers)
        time.sleep(0.5)
        # Only store real image payloads; previously any response body,
        # including HTTP error pages, was written out as a .jpg file.
        if (pic_response is None or pic_response.status_code != 200
                or not pic_response.content):
            print(img_url + " 下载失败")
            continue
        # "with" guarantees the file handle is closed after the write.
        with open(os.path.join(pic_path, "%d.jpg" % sequence), 'wb') as pic_file:
            pic_file.write(pic_response.content)


def main(start_id=5300, end_id=5350):
    """Crawl every picture set whose ID lies in ``range(start_id, end_id)``.

    A set whose target directory already exists is skipped without
    downloading anything.

    Args:
        start_id: First set ID to fetch (inclusive).
        end_id: Set ID to stop at (exclusive). Adjust the range by
            experimenting with which IDs exist.
    """
    for i in range(start_id, end_id):
        time.sleep(2)
        main_url = "https://m.mm131.net/xinggan/" + str(i) + ".html"
        response = _fetch(main_url, headers)
        if response is None or response.status_code != 200:
            print(main_url + " 访问错误")
            continue

        span_number = find_set_span(response.text)
        if span_number is None:
            # Skip this set; previously execution fell through and reused
            # the page count of the previous set (or hit a NameError).
            print(main_url + " 网页解码异常")
            continue
        print(main_url, end="")
        print(" 此套图页数:" + str(span_number))

        # Create the target folder.
        pic_path = PIC_DIR_TEMPLATE % i
        if os.path.exists(pic_path):
            print(pic_path + " 目录已存在")
            continue
        os.makedirs(pic_path)
        print(pic_path + " 创建成功")

        # Fetch the pictures.
        _download_set(i, span_number, pic_path)


if __name__ == "__main__":
    main()