中文乱码转换:把形如 \xe8\xb0\xa3\xe8\xa8\x80 的转义字节串还原成中文(这里是"谣言"):
# The literal is a str whose code points are really raw UTF-8 byte values.
# Encoding with raw_unicode_escape maps each code point below 256 back to the
# byte of the same value; decoding those bytes as UTF-8 yields '谣言'.
'\xe8\xb0\xa3\xe8\xa8\x80'.encode('raw_unicode_escape').decode("utf-8")
#本文通过新的方式爬取突破
- 由于公司列表页信息很少反爬,除了公司名称其他信息都没有,所以可以取巧提取注册时间注册资本信息
- 访问过多过快也会封,测试可以通过随机UA突破
- 另外公司具体信息详情页可能不同公司展示xpath位置不一样,所以用re
- 另外,经营范围下载回来会出现以 &#x 开头的乱码;&#xxx; 这种格式其实是 HTML 字符实体(对应 Unicode 码点),用 HTMLParser 库解析即可还原(注意:Python 3.9 起 HTMLParser().unescape 已被移除,需改用 html.unescape)
#最后实现结果:基本3秒能查出5家相关企业具体公司信息
**更新:** 天眼查有更新,现在可以通过企业名、工商号或者纳税人号进行查询
import requests
from lxml import etree
import random
import re
# import HTMLParser
from html.parser import HTMLParser

# Proxies handed to requests, keyed by URL scheme.  The original literal used
# the key "http" twice, so the first entry was silently discarded and HTTPS
# requests (the only kind this script makes) were never proxied.
proxy = {
    "http": "http://125.70.13.77:8080",
    "https": "https://183.6.129.212:41949",
}

# Pool of browser User-Agent strings; one is picked at random per run.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# The User-Agent is drawn once here and reused for every request of this run.
dd = random.choice(USER_AGENTS)
print(dd)

# Headers sent with every request.
headers = {
    "Referer": "https://www.baidu.com/",
    "User-Agent": dd,
}
def down_load(url, timeout=10):
    """Fetch *url* and return the response body as text.

    The request is sent with the module-level ``headers`` and ``proxy``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so that a
            dead proxy cannot hang the script forever.

    Returns:
        The response body, decoded as UTF-8.

    Raises:
        Whatever ``requests.get`` raises (e.g. on connection failure or
        timeout); nothing is caught here.
    """
    cc = requests.get(url=url, headers=headers, proxies=proxy, timeout=timeout)
    # Force UTF-8 instead of the charset requests derives from the response.
    cc.encoding = "utf-8"
    return cc.text


i = input("请输入企业相关信息(企业名、工商号或纳税人号):")
# HTMLParser().unescape() was removed in Python 3.9; html.unescape replaces it.
from html import unescape


def _first_or_default(values, default=""):
    """Return the first element of *values*, or *default* when it is empty."""
    return values[0] if values else default


def _detail_field(label, page):
    """Return the span text that follows *label* in a detail page, or ""."""
    pattern = "%s:</span><span>(.*?)</span></div>" % label
    return _first_or_default(re.findall(pattern, page, re.S))


# Search page for the keyword typed by the user (``i``).
first_url = "https://m.tianyancha.com/search?key=%s" % i
a = etree.HTML(down_load(first_url))

# The search result list already carries the detail link, legal person,
# registered capital and registration date of every hit.
detail_url = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')
boss = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')
the_registered_capital = a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()')
the_registered_time = a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()')

gs = []   # one list of labelled fields per company, in result order
gs1 = {}  # the same records keyed by their 1-based position

# zip() stops at the shortest list, so a hit that lacks one of the values no
# longer raises IndexError halfway through the run.
hits = zip(detail_url, boss, the_registered_capital, the_registered_time)
for ii, (url, legal_person, capital, founded) in enumerate(hits):
    aa = down_load(url)
    bb = etree.HTML(aa)
    company = _first_or_default(bb.xpath('//div[@class="over-hide"]/div/text()'))
    # Detail pages are read with regexes; a field that is missing from the
    # page becomes an empty string.
    the_enterprise_type = _detail_field("企业类型", aa)
    registration_number = _detail_field("工商注册号", aa)
    organization_code = _detail_field("组织结构代码", aa)
    credit_code = _detail_field("统一信用代码", aa)
    business_period = _detail_field("经营期限", aa)
    registration_authority = _detail_field("登记机关", aa)
    registered_address = _detail_field("注册地址", aa)
    # The business scope arrives as HTML character references (&#...;).
    scope_of_business = unescape(
        _first_or_default(re.findall('<text class="tyc-num">(.*?)</text>', aa, re.S))
    )
    new = [
        "公司名:" + company,
        "法人:" + legal_person,
        "注册时间:" + founded,
        "注册资本:" + capital,
        "企业类型:" + the_enterprise_type,
        "工商注册号:" + registration_number,
        "组织结构代码:" + organization_code,
        "统一信用代码:" + credit_code,
        "经营年限:" + business_period,
        "登记机关:" + registration_authority,
        "注册地址:" + registered_address,
        "经营范围:" + scope_of_business,
    ]
    gs1[ii + 1] = list(new)
    gs.append(new)

print(gs)
也可以参考看下上篇:通过scrapy结合selenium抓取天眼查
后续通过tkinter制作的可视化界面
import tkinter as tk
import requests
from lxml import etree
import random
import re
from html.parser import HTMLParser

# Main window: 790x550 pixels, placed at screen offset (500, 200).
window = tk.Tk()
window.title("公司信息查询器")
window.geometry("790x550+500+200")

# Caption for the search box and caption for the result area.
l = tk.Label(window, text="企业名、工商号或纳税人号:", font="微软雅黑 11", height=2)
l.grid()
l1 = tk.Label(window, text="这就是为你查询的结果:", font="微软雅黑 11", height=2)
l1.grid()

# Only referenced by the disabled Entry-based result widget noted below.
var = tk.StringVar()

# Search box.
e = tk.Entry(window, width=62)
e.grid(row=0, column=1)

# Result area.  A single-line alternative was tried and left disabled:
# e1 = tk.Entry(window, textvariable=var, width=60)
e1 = tk.Text(window, height=30)
e1.grid(row=2, column=1)


def click():
    """Search for the text in the entry box and append every hit to the result box.

    Reads the keyword from ``e``, queries the mobile search page, then fetches
    each hit's detail page.  Every company is written to ``e1`` as one field
    per line followed by a blank line.  If a request fails, a one-line error
    message is written to ``e1`` instead of leaving the failure visible only
    as a traceback on stderr.
    """
    # HTMLParser().unescape() was removed in Python 3.9.
    from html import unescape

    content = e.get()
    # Keyed by URL scheme.  The original literal repeated the key "http", which
    # dropped the first proxy and left HTTPS requests unproxied.
    proxy = {
        "http": "http://125.70.13.77:8080",
        "https": "https://183.6.129.212:41949",
    }
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]
    # A fresh User-Agent is drawn for every click.
    headers = {
        "Referer": "https://www.baidu.com/",
        "User-Agent": random.choice(USER_AGENTS),
    }

    def down_load(url):
        """Return the UTF-8 decoded body of *url*."""
        # The timeout keeps a dead proxy from freezing the GUI indefinitely.
        cc = requests.get(url=url, headers=headers, proxies=proxy, timeout=10)
        cc.encoding = "utf-8"
        return cc.text

    def first(values):
        """Return the first element of *values*, or "" when it is empty."""
        return values[0] if values else ""

    def detail_field(label, page):
        """Return the span text that follows *label* in a detail page, or ""."""
        return first(re.findall("%s:</span><span>(.*?)</span></div>" % label, page, re.S))

    try:
        a = etree.HTML(down_load("https://m.tianyancha.com/search?key=%s" % content))
        detail_url = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')
        boss = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')
        the_registered_capital = a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()')
        the_registered_time = a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()')

        # zip() stops at the shortest list instead of raising IndexError when
        # one of the per-hit values is missing.
        hits = zip(detail_url, boss, the_registered_capital, the_registered_time)
        for ii, (url, legal_person, capital, founded) in enumerate(hits):
            aa = down_load(url)
            bb = etree.HTML(aa)
            company = first(bb.xpath('//div[@class="over-hide"]/div/text()'))
            # The business scope arrives as HTML character references (&#...;).
            scope_of_business = unescape(
                first(re.findall('<text class="tyc-num">(.*?)</text>', aa, re.S))
            )
            new = [
                ii + 1,
                "公司名:" + company,
                "法人:" + legal_person,
                "注册时间:" + founded,
                "注册资本:" + capital,
                "企业类型:" + detail_field("企业类型", aa),
                "工商注册号:" + detail_field("工商注册号", aa),
                "组织结构代码:" + detail_field("组织结构代码", aa),
                "统一信用代码:" + detail_field("统一信用代码", aa),
                "经营年限:" + detail_field("经营期限", aa),
                "登记机关:" + detail_field("登记机关", aa),
                "注册地址:" + detail_field("注册地址", aa),
                "经营范围:" + scope_of_business,
            ]
            # Text.insert expects a string; joining puts one field per line
            # rather than relying on Tcl's rendering of a Python list.
            e1.insert("end", "\n".join(str(item) for item in new))
            e1.insert("end", "\n\n")  # blank line between companies
    except requests.RequestException as exc:
        e1.insert("end", "查询失败:%s\n\n" % exc)
# "Search" runs click(); "Quit" stops the Tk main loop.
b = tk.Button(window, text="点击查询", command=click, width=10, font="微软雅黑 12")
b.grid(row=6, column=0)
b1 = tk.Button(window, text="退出", command=window.quit, width=10, font="微软雅黑 12")
b1.grid(row=6, column=1)

# Hand control to Tk; this call returns once the main loop is stopped.
window.mainloop()
后续为了有目标地爬取 1000 来家公司的信息,对程序进行了一定的改造:
1. pandas 读取的数据转化成 list
# Read the CSV with pandas and convert the DataFrame into a plain list of rows
# (each row is a list of that row's column values).
content=pd.read_csv(r"C:\Users\Administrator\Desktop\5005.csv",encoding="utf-8").values.tolist()
2. 大批量爬取时要用 try 做异常处理,否则程序中途一旦出错,已经爬到的数据会全部丢失,只能从头再来;主要考虑两个方面:去重和断点续爬
import requests
from lxml import etree
import random
import re
import csv
import pandas as pd
# from multiprocess import Pool
# import HTMLParser
from html.parser import HTMLParser
from fake_useragent import UserAgent

# ua = UserAgent()  # alternative User-Agent source, currently disabled

# Candidate proxies grouped by URL scheme.  The original wrote all of them
# into one dict literal with repeated "http"/"https" keys, which keeps only
# the last entry per key and silently discards the other twelve.
PROXY_POOL = {
    "https": [
        "https://114.116.10.21:3128",
        "https://113.200.56.13:8010",
        "https://14.20.235.220:9797",
        "https://119.31.210.170:7777",
    ],
    "http": [
        "http://47.105.151.97:80",
        "http://221.193.222.7:8060",
        "http://115.223.222.206:9000",
        "http://106.12.3.84:80",
        "http://49.81.125.62:9000",
        "http://119.29.26.242:8080",
        "http://118.24.98.96:9999",
        "http://183.129.207.84:52264",
        "http://121.10.71.82:8118",
        "http://113.16.160.101:8118",
    ],
}

# Mapping in the shape requests expects (one proxy URL per scheme), with the
# proxy for each scheme drawn at random from the pool once per run.
proxy = {scheme: random.choice(urls) for scheme, urls in PROXY_POOL.items()}
# Pool of browser User-Agent strings; one is picked at random per run.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# The User-Agent is drawn once here and reused for every request of this run.
dd = random.choice(USER_AGENTS)

# Headers sent with every request.
headers = {
    "Referer": "https://www.baidu.com/",
    "User-Agent": dd,
    # Disabled alternative: "User-Agent": ua.random
}
def down_load(url, timeout=10):
    """Fetch *url* and return the response body as text.

    The request is sent with the module-level ``headers`` and ``proxy`` and
    with TLS certificate verification switched on.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so that a
            dead proxy cannot stall a long batch run forever.

    Returns:
        The response body, decoded as UTF-8.

    Raises:
        Whatever ``requests.get`` raises (e.g. on connection failure or
        timeout); nothing is caught here.
    """
    cc = requests.get(
        url=url, headers=headers, proxies=proxy, verify=True, timeout=timeout
    )
    # Force UTF-8 instead of the charset requests derives from the response.
    cc.encoding = "utf-8"
    return cc.text
# HTMLParser().unescape() was removed in Python 3.9; html.unescape replaces it.
from html import unescape

# Column headers of the exported CSV files.
CSV_HEADER = ["编号", "公司名", "法人", "注册时间", "注册资本", "企业类型", "工商注册号",
              "组织结构代码", "统一信用代码", "经营年限", "登记机关", "注册地址", "经营范围"]
# Rewritten after every scraped company so an interrupted run keeps its data.
CHECKPOINT_CSV = "5006663.csv"
# Written once after the whole input has been processed.
RESULT_CSV = "500666666666.csv"
# 0-based index into ``content`` to start from; raise it to resume a run.
START_ROW = 1101
# Values of the status span on the search page that mark a company to skip.
SKIPPED_STATUSES = ("未公开", "仍注册")


def _required_field(label, page):
    """Return the span text following *label* in a detail page.

    Raises:
        IndexError: if the page has no such field.
    """
    return re.findall("%s:</span><span>(.*?)</span></div>" % label, page, re.S)[0]


def _write_rows(path, rows):
    """Write the header line followed by *rows* to *path*, replacing the file."""
    with open(path, "w", encoding="utf-8", newline="") as f:
        k = csv.writer(f, dialect="excel")
        k.writerow(CSV_HEADER)
        k.writerows(rows)


def _scrape_company(m, keyword):
    """Look up *keyword* and return its CSV row, or None if it is to be skipped.

    Args:
        m: 0-based index of the keyword in ``content``; the row is numbered m+1.
        keyword: Search text (company name, registration or tax number).

    Raises:
        IndexError: if the search returns no hit or a detail field is missing.
    """
    a = etree.HTML(down_load("https://m.tianyancha.com/search?key=%s" % keyword))
    detail_url = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')[0]
    # xpath() returns a list.  The original compared that list itself with a
    # string, which is always False, so these two statuses were never skipped.
    status = a.xpath('/html/body/div[3]/div[3]/div[1]/div[4]/div/div/div[4]/span/text()')
    the_registered_capital = a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()')[0]
    the_registered_time = a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()')[0]
    if (
        (status and status[0] in SKIPPED_STATUSES)
        or the_registered_capital == "-"
        or the_registered_time == "-"
    ):
        return None

    boss = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')[0]
    print(detail_url)
    aa = down_load(detail_url)
    bb = etree.HTML(aa)
    company = bb.xpath('//div[@class="over-hide"]/div/text()')[0]
    the_enterprise_type = _required_field("企业类型", aa)
    registration_number = _required_field("工商注册号", aa)
    organization_code = _required_field("组织结构代码", aa)
    credit_code = _required_field("统一信用代码", aa)
    business_period = _required_field("经营期限", aa)
    registration_authority = _required_field("登记机关", aa)
    registered_address = _required_field("注册地址", aa)
    # The business scope arrives as HTML character references (&#...;).
    scope_of_business = unescape(
        re.findall('<text class="tyc-num">(.*?)</text>', aa, re.S)[0]
    )
    return [str(m + 1), company, boss, the_registered_time, the_registered_capital,
            the_enterprise_type, registration_number, organization_code, credit_code,
            business_period, registration_authority, registered_address,
            scope_of_business]


# Read the input CSV with pandas and convert it into a plain list of rows.
content = pd.read_csv(r"C:\Users\Administrator\Desktop\5005.csv", encoding="utf-8").values.tolist()

gs = []
# The upper bound is len(content): the original used len(content) + 1, which
# indexed one past the end of the list on the final iteration.
for m in range(START_ROW, len(content)):
    i = content[m][0]
    try:
        new = _scrape_company(m, i)
    except (IndexError, requests.RequestException) as exc:
        # One bad company must not abort the batch.  This replaces the
        # original's deliberate NameError plus bare ``except``.
        print("row %d (%s) skipped: %r" % (m + 1, i, exc))
        continue
    if new is None:
        continue
    print(m + 1)
    gs.append(new)
    _write_rows(CHECKPOINT_CSV, gs)

_write_rows(RESULT_CSV, gs)