# Given a list of company names, scrape each company's address; you can add other fields as well.
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 23 14:11:50 2019

@author: Administrator
"""

import random
import sys
import time
import urllib

import lxml
import requests
import xlwt
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
# Pool of desktop Chrome User-Agent strings that get_user_agent() draws from.
# Kept as an immutable module-level tuple so it is built once at import time
# instead of on every call.
_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
)


def get_user_agent():
    """Return one User-Agent string picked at random from the built-in pool.

    The choice is made with ``random.choice``, so it follows the state of the
    global ``random`` generator (seeding it makes the result reproducible).

    Returns:
        str: One of the entries of ``_USER_AGENTS``.
    """
    return random.choice(_USER_AGENTS)
# Request headers sent with every search request.
# NOTE: 'User-Agent' is evaluated once, when this module is loaded, so every
# request made by the process carries the same randomly chosen value.
# NOTE(review): 'Cookie' and 'Referer' are a hard-coded browser session pasted
# into the source; presumably they must be refreshed once the session expires,
# and a session cookie should not live in version control — confirm.
headers = {
    'Host': 'www.qichacha.com',
    'Connection': 'keep-alive',
    'Accept': r'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': get_user_agent(),
    'Referer': 'https://www.qichacha.com/search?key=%E5%B9%BF%E4%B8%9C%E6%83%A0%E5%AE%A0%E5%AE%A0%E7%89%A9%E7%94%A8%E5%93%81%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': r'QCCSESSID=hjn1j0dpcjv5odsb6ropihka93; zg_did=%7B%22did%22%3A%20%2216c1d9ae9c037c-05a6ce5f67bf25-454c092b-1fa400-16c1d9ae9c1265%22%7D; hasShow=1; acw_tc=3da0cc9815638647858958978e47b118e1822ab9bb75de43e1ea41d84f; acw_sc__v2=5d36aed169dcad5ff9db49c113037e55f6619054; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201563864787401%2C%22updated%22%3A%201563864824337%2C%22info%22%3A%201563864787411%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.qichacha.com%22%2C%22cuid%22%3A%20%2281402a1a8c4137bae6e4a2d48a37cee6%22%7D',
}

# Company names searched when Html() is called without arguments.
_DEFAULT_COMPANY_NAMES = (
    '城市宠物医院有限公司',
    '索尼奇宠物美容屋有限公司',
    '香港宠物美容师协会有限公司',
    '甜蜜屋专业宠物美容有限公司',
    '家宝宠物美容及用品有限公司',
    '宝罗国际宠物美容股份有限公司',
    '宝罗国际宠物美容股份有限公司-北屯分公司',
    '宝罗国际宠物美容股份有限公司-新庄分公司',
    '徐州优派特宠物食品有限公司',
    '江西一起爱它宠物食品有限公司',
    '聊城市宠物食品协会',
    '江苏虹宠宠物用品有限公司',
)

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the whole run indefinitely.
_REQUEST_TIMEOUT = 10


def Html(names=None):
    """Search qichacha.com for each company name and print the rows found.

    For every name a search URL is built and fetched with the module-level
    ``headers``. Each ``tr`` inside the ``.m_srchList`` element of the
    response is then printed as "<company name> <company address>".
    Failures are reported on stdout and the loop moves on to the next name.

    Only the first results page is requested. The original file carried
    commented-out code that built later pages as
    ``https://www.qichacha.com/search?key=<name>#p:<page - 1>&``; that was
    never wired in and is not implemented here.

    Args:
        names: Optional iterable of company names to look up. When omitted,
            the built-in ``_DEFAULT_COMPANY_NAMES`` are used.

    Returns:
        The search URL built for the last name processed, or ``None`` when
        ``names`` is empty.
    """
    if names is None:
        names = _DEFAULT_COMPANY_NAMES

    url = None
    for name in names:
        url = r'https://www.qichacha.com/search?key={}'.format(name)

        try:
            response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
            if response.status_code != 200:
                # Non-200 responses are reported but still parsed below, as
                # in the original flow.
                response.encoding = 'utf-8'
                print(response.status_code)
                print('ERROR')
            soup = pq(response.text)
            com_all_info = soup.find(".m_srchList")
        except Exception as exc:
            print('请求都不让,这企查查是想逆天吗???')
            print(repr(exc))
            # Nothing was fetched for this name: skip parsing so we never
            # touch an unbound result or re-print the previous company's rows.
            continue

        try:
            com_all_info_array = com_all_info.find("tr")
            print('开始爬取数据,请勿打开excel')
            for tr in com_all_info_array.items():
                mtxs = tr.find(".m-t-xs")
                temp_g_name = tr.find(".ma_h1").text()  # company name
                temp_g_addr = mtxs.eq(2).text()  # company address
                print(temp_g_name, temp_g_addr)
        except Exception as exc:
            print('好像被拒绝访问了呢...请稍后再试叭...')
            print(repr(exc))

    return url


if __name__ == '__main__':
    Html()