模拟微信小程序(摩拜单车)的请求:构造请求头和表单数据,以 POST 方式发送给服务器并获取响应(response)。
反反爬虫措施:轮换手机端 User-Agent、使用代理 IP、每次请求之间休眠 0.1 秒。
代码分为两部分:多线程获取代理ip,多线程爬虫
一、多线程获取代理ip
from urllib.request import urlopen
import re
import requests
from bs4 import BeautifulSoup as bs
from urllib import request
import socket
import threading
import time

# Default timeout, in seconds, for sockets created from here on.
socket.setdefaulttimeout(5)
# URL that check_ip fetches through a proxy to decide whether the proxy works.
test_url = "http://ip.chinaz.com/getip.aspx"
# request the xici URL and get the response
def request_to_get(url):
    """Download *url* with browser-like headers and return the parsed page.

    The response body is decoded as UTF-8 and handed to BeautifulSoup using
    the "html.parser" backend; the resulting soup object is returned.
    """
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "www.xicidaili.com",
        "Referer": "http://www.xicidaili.com/",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    }
    raw_body = requests.get(url, headers=browser_headers).content
    return bs(raw_body.decode("utf-8"), "html.parser")


# get ip and port; returns a list of {"http": "http://ip:port"} dicts
def find_ip_port(bs_obj):
    """Extract proxy addresses from a parsed proxy-listing page.

    The first ``<tr>`` is treated as the table header and skipped.  In every
    following row the second ``<td>`` is read as the IP and the third as the
    port.

    Args:
        bs_obj: parsed page exposing ``findAll`` (e.g. a BeautifulSoup object).

    Returns:
        A list of proxy mappings of the form ``{"http": "http://ip:port"}``,
        in page order.
    """
    proxys = []
    for row in bs_obj.findAll('tr')[1:]:
        cells = row.findAll("td")
        # Rows without an ip and a port cell (separators, ads, ...) used to
        # raise IndexError and abort the whole scrape; skip them instead.
        if len(cells) < 3:
            continue
        # Strip padding so the proxy URL never contains stray whitespace.
        host = cells[1].text.strip()
        port = cells[2].text.strip()
        proxys.append({"http": "http://{}:{}".format(host, port)})
    return proxys


# check that a proxy works
def check_ip(alright_proxys, proxy):
    """Fetch ``test_url`` through *proxy* and record the proxy if that works.

    Args:
        alright_proxys: list shared with the caller; *proxy* is appended to it
            when the test request succeeds.
        proxy: mapping passed to ``ProxyHandler``, e.g.
            ``{"http": "http://ip:port"}``.

    Returns:
        None.  The result is reported only through *alright_proxys*.
    """
    try:
        opener = request.build_opener(request.ProxyHandler(proxy))
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
        # Open through this opener directly.  install_opener() replaces the
        # process-wide default opener, so when several checks run in threads
        # one thread's urlopen() could go out through another thread's proxy.
        with opener.open(test_url) as response:
            # As before, a body that is not valid UTF-8 counts as a failure.
            response.read().decode("utf-8")
    except Exception:
        # Best-effort probe: any failure (refused, timed out, bad reply, ...)
        # simply means this proxy is not usable.
        return
    alright_proxys.append(proxy)


# test the proxies and return those that can be used
def return_ok_proxys(proxys):
    """Check every proxy in *proxys* concurrently and return the working ones.

    One thread running ``check_ip`` is started per proxy.  The function waits
    for all of them to finish, so the returned list is complete when the
    caller receives it.  Previously it slept a fixed 5 seconds, which could
    return while checks were still running and delayed even an empty input.

    Args:
        proxys: iterable of proxy mappings as produced by ``find_ip_port``.

    Returns:
        List of the proxies for which ``check_ip`` succeeded (order depends on
        which checks finish first).
    """
    alright_proxys = []
    workers = [
        threading.Thread(target=check_ip, args=(alright_proxys, proxy))
        for proxy in proxys
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return alright_proxys


# main function
def main_function():
    """Build the proxy pool.

    Downloads the xicidaili listing page, extracts the proxies it lists and
    returns only those that pass ``return_ok_proxys``.
    """
    listing_page = request_to_get("http://www.xicidaili.com/nn/")
    candidates = find_ip_port(listing_page)
    return return_ok_proxys(candidates)
二、多线程爬虫
long、alt 分别为经度和纬度(代码中的 alt 实际表示纬度 latitude),具体范围可以自己用百度查;city_code 为标准的城市编码(代码中为 '010');wxcode 填一个假的值即可——一开始我也不知道,后来百度了才发现可以这样。
import requests
import time
import threading
import random
# Star import kept as in the original; the script relies on it for
# main_function(), which builds the proxy pool.
from get_ip_pools import *

# Endpoint that the crawler posts coordinates to.
url = "https://mwx.mobike.com/mobike-api/rent/nearbyBikesInfo.do"
# distId values collected so far; main() appends each new one.
bike_id = []
# Mobile browser User-Agent strings; one is picked at random per request.
# The last entry used to be split across two physical lines inside its quotes,
# which is a syntax error; it is rejoined with the single space it had.
user_agents = [
    'Mozilla/5.0 (Linux; U; Android 5.1; zh-cn; m1 metal Build/LMY47I) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.6 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baiduboxapp/8.6.5 (Baidu; P1 5.1.1)',
    'Mozilla/5.0 (Linux; Android 6.0; MP1512 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/7.4 baiduboxapp/8.4 (Baidu; P1 6.0)',
    'Mozilla/5.0 (Linux; U; Android 4.4.4; zh-cn; X9007 Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.6 Mobile Safari/537.36',
    'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.6.0 Mobile/14E304 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1',
    'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; vivo Xplay6 Build/MXB48T) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.6 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 6.0.1; SM-A9000 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baiduboxapp/8.6.5 (Baidu; P1 6.0.1)',
    'Mozilla/5.0 (Linux; Android 6.0.1; vivo X9Plus Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baiduboxapp/8.6.5 (Baidu; P1 6.0.1)',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Mobile/14C92 MicroMessenger/6.5.9 NetType/WIFI Language/zh_CN',
    'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R11t Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043307 Safari/537.36 MicroMessenger/6.5.8.1060 NetType/WIFI Language/zh_CN',
    'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 MQQBrowser/7.5.1 Mobile/13G36 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1',
]
# Counter advanced by main() after each processed response (debug aid).
t_count = 1
# Seconds to wait for the API before giving up; without a timeout a dead
# proxy can block a worker thread (and the join on it) indefinitely.
REQUEST_TIMEOUT = 5
# Guards bike_id and t_count, which main() updates from many threads at once.
_state_lock = threading.Lock()


def init_datas(user_agent, city_code, long_itude, lat_itude):
    """Build the request headers and form data for one nearby-bikes query.

    Args:
        user_agent: value for the ``user-agent`` header.
        city_code: city code, sent both as a header and as a form field.
        long_itude: longitude to query, sent as the ``longitude`` field.
        lat_itude: latitude to query, sent as the ``latitude`` field.

    Returns:
        A ``(header, datas)`` tuple of dicts for ``requests.post``.
    """
    header = {
        'host': 'mwx.mobike.com',
        'content-type': 'application/x-www-form-urlencoded',
        'opensrc': 'list',
        # NOTE(review): key is spelled 'moblieno' in the original; left as is
        # because it is sent on the wire - confirm against the real API.
        'moblieno': '',
        'wxcode': 'fuck_wxcode',
        'platform': '3',
        'accept-language': 'zh-cn',
        'subsource': '',
        'lang': 'zh',
        'user-agent': str(user_agent),
        # Current time in milliseconds.
        'time': str(int(time.time() * 1000)),
        'citycode': str(city_code),
    }
    # The original literal listed 'citycode' twice ('getLocation:ok', then the
    # city code); the first value was always overwritten, so only the
    # effective entry is kept.  NOTE(review): the dropped value may have been
    # meant for a different key - confirm against the real API.
    datas = {
        'verticalAccuracy': 10,
        'speed': -1,
        'horizontalAccuracy': 65,
        'accuracy': 65,
        'citycode': str(city_code),
        'wxcode': 'fuck_wxcode',
        'longitude': str(long_itude),
        'latitude': str(lat_itude),
    }
    return header, datas


def main(city_code, long, lat, ip):
    """Query the bikes near one coordinate and record their ids.

    Posts to ``url`` through the proxy *ip* with a randomly chosen user agent,
    then appends every ``distId`` not yet in ``bike_id`` and prints it.
    Used as a thread target, so failures are printed rather than raised.

    Args:
        city_code: city code passed on to ``init_datas``.
        long: longitude string.
        lat: latitude string.
        ip: proxies mapping handed to ``requests.post``.
    """
    import json

    global t_count
    header, datas = init_datas(random.choice(user_agents), city_code, long, lat)
    try:
        response = requests.post(
            url, headers=header, data=datas, proxies=ip, timeout=REQUEST_TIMEOUT
        )
        payload = json.loads(str(response.content, encoding='utf-8'))
        bikes = payload['object']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Dead proxy, non-JSON reply or unexpected payload: report and stop,
        # instead of killing the thread with a traceback.
        print(e)
        return
    try:
        # Check-then-append must be atomic or two threads can both add the
        # same id and inflate the final count.
        with _state_lock:
            for bike in bikes:
                dist_id = bike['distId']
                if dist_id not in bike_id:
                    bike_id.append(dist_id)
                    print(dist_id)
    except (KeyError, TypeError) as e:
        print(e)
    with _state_lock:
        t_count += 1


city_code = '010'
# Bounding box to crawl.  "long" is longitude; "alt" holds the latitude.
start_long = "116.250000000000"
start_alt = "39.910000000000"
end_long = "116.330000000000"
end_alt = "39.92000000000"
# Worker threads of the batch currently in flight, and how many there are.
threads = []
thread_count = 0
ip_pools = main_function()
if not ip_pools:
    # Picking a random proxy from an empty pool would fail with an obscure
    # ValueError on the first grid point; stop with a clear message instead.
    raise SystemExit("no usable proxy found; cannot start crawling")

# Walk the box on a 1/2000-degree grid, one request thread per grid point.
for i in range(int(float(start_long) * 2000), int(float(end_long) * 2000)):
    for j in range(int(float(start_alt) * 2000), int(float(end_alt) * 2000)):
        long_itude = str(float(i) / 2000.0) + "00000000000"
        lat_itude = str(float(j) / 2000.0) + "00000000000"
        t = threading.Thread(
            target=main,
            args=(city_code, long_itude, lat_itude, random.choice(ip_pools)),
        )
        t.start()
        # Throttle: pause briefly between requests.
        time.sleep(0.1)
        threads.append(t)
        thread_count += 1
        # Cap the number of live threads: drain each batch of 128.
        if thread_count == 128:
            for t in threads:
                t.join()
            thread_count = 0
            threads = []

# Wait for the last, partial batch too; otherwise the count below is printed
# while requests are still in flight.
for t in threads:
    t.join()
print(len(bike_id))
三、效果图
我自己跑出来的效果图找不到了,也懒得重新跑,这里就用同学跑的结果吧(这份代码是我帮同学写的,我真是大公无私)。