System Home Page
Weather Data Analysis
Historical Weather Data Query
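This module crawls monthly historical weather data (date, daily high and low temperature, weather, and wind direction) from lishi.tianqi.com for each city listed in city.json, and saves the results into the lishiweathers table of a MySQL database. The full crawler script follows.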
import requests
import re
import time as delay  # aliased so the name "time" stays free for use as a loop variable
from bs4 import BeautifulSoup
import pandas as pd
import pymysql
import json

# Fetch the HTML source of a page
def get_page(url, headers):
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        html.encoding = html.apparent_encoding
        return html.text
    else:
        print("Failed to fetch page:", url)  # debug output for non-200 responses
        return None

# Parse the data in a page and return it as a DataFrame
def parse_page(html):
    # Empty lists to hold the extracted columns
    date_box = []
    max_temp = []
    min_temp = []
    weh = []
    wind = []
    # Parse the page with BeautifulSoup
    bs = BeautifulSoup(html, 'html.parser')
    # Locate the element that contains the data table
    data = bs.find_all(class_='thrui')
    # Extract the fields with regular expressions
    date = re.compile('class="th200">(.*?)</div>')
    tem = re.compile('class="th140">(.*?)</div>')
    time = re.findall(date, str(data))
    for item in time:
        # Keep only the first 10 characters (the date), dropping the weekday suffix
        date_box.append(item[:10])
    temp = re.findall(tem, str(data))
    for i in range(len(temp) // 4):
        max_temp.append(temp[i * 4 + 0])
        min_temp.append(temp[i * 4 + 1])
        weh.append(temp[i * 4 + 2])
        wind.append(temp[i * 4 + 3])
    # Assemble the columns into a DataFrame (no weekday column)
    datas = pd.DataFrame({
        '日期': date_box,
        '最高温度': max_temp,
        '最低温度': min_temp,
        '天气': weh,
        '风向': wind
    })
    return datas

# Save the rows of a DataFrame into the MySQL table
def save_to_mysql(datas, city, db, cursor):  # the city is passed in so it can be stored with each row
    for i in range(len(datas)):
        date = datas.iloc[i, 0]
        # -? allows negative winter temperatures
        max_temp = float(re.findall(r'-?\d+', datas.iloc[i, 1])[0])
        min_temp = float(re.findall(r'-?\d+', datas.iloc[i, 2])[0])
        weh = datas.iloc[i, 3]
        wind = datas.iloc[i, 4]
        # Include the city in each row; a parameterized query avoids quoting and injection problems
        sql = ("INSERT INTO lishiweathers (城市, 日期, 最高温度, 最低温度, 天气, 风向) "
               "VALUES (%s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, (city, date, max_temp, min_temp, weh, wind))
    db.commit()
    print("Saved the data into the lishiweathers table")

def crawl_weather(city, code, start, end, db, cursor):
    # Build the page URL from the city code and the time range
    url = "http://lishi.tianqi.com/{}/{}{}.html".format(code, start, end)
    # Request headers that mimic a normal browser visit
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Cookie': '******'  # masked; supply your own cookie string
    }
    try:
        # Fetch the page source
        html = get_page(url, headers)
        # Parse the page into a DataFrame
        datas = parse_page(html)
        # Save the DataFrame into the MySQL table
        save_to_mysql(datas, city, db, cursor)  # pass the cursor along
        print("Crawled historical weather data for {} ({}{})".format(city, start, end))
    except Exception as e:
        print("Failed to crawl historical weather data for {} ({}{}): {}".format(city, start, end, e))

# Read the city names and codes from city.json in the current directory
with open('city.json', 'r', encoding='utf-8') as f:
    city_dict = json.load(f)  # load the JSON file into a dict
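# (Assumed format) city.json is not shown here; it is expected to map each
# city name to the pinyin code used in lishi.tianqi.com URLs, for example
# {"北京": "beijing", "上海": "shanghai"}.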

# Define the list of months (YYYYMM) to crawl; modify or extend it as needed
#time_list = ["202201","202202","202203","202204","202205","202206","202207","202208","202209","202210","202211","202212","202301","202302","202303","202304","202305","202306","202307","202308","202309","202310","202311","202312","202401","202402"]
time_list = ["202301","202302","202303","202304","202305","202306","202307","202308","202309","202310","202311","202312","202401","202402","202403","202404","202405","202406","202407","202408","202409","202410","202411","202412","202501","202502"]# 打开数据库连接

# Open the database connection; supply your own host, user, password, port, and database name
db = pymysql.connect(host="localhost", user="root", passwd="123456", port=3306, db="tianqi")
# Create a cursor object
cursor = db.cursor()

# Iterate over the city dict and the month list, calling the crawler for each pair
for city, code in city_dict.items():
    for time in time_list:
        start = time[:6]
        end = time[6:]  # entries are single YYYYMM months, so end is empty here
        crawl_weather(city, code, start, end, db, cursor)  # pass the shared cursor
        delay.sleep(5)  # pause between requests so the site is not hammered
# Close the database connection
db.close()
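As a quick sanity check after a run, a few of the saved rows can be read back. This is a minimal sketch that reuses the connection settings and the lishiweathers table from the script above:

import pymysql

db = pymysql.connect(host="localhost", user="root", passwd="123456", port=3306, db="tianqi")
cursor = db.cursor()
cursor.execute("SELECT 城市, 日期, 最高温度, 最低温度, 天气, 风向 FROM lishiweathers LIMIT 5")
for row in cursor.fetchall():
    print(row)
db.close()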
Database Design