一、每题的源代码及注释说明 (主要代码)
(1)编写数据
编写爬虫代码,对乐有家网站上包括广州在内的珠三角九个城市的二手房数据进行爬取:
from selenium import webdriver
from lxml import etree
import requests
import time
import csv
# Pages are opened through selenium. Instead of clicking the "next page"
# button, the crawler jumps straight to each page URL for efficiency.
# Create the output CSV file.
file = open('original_data.csv', 'wt', newline='', encoding='utf-8')
writer = csv.writer(file)
# Column titles, written once as the first row of the file.
column_titles = (
    '楼盘名称', '区域', '单价', '本小区均价', '新区片均价', '总价', '面积',
    '房源编号', '户型', '朝向', '核心卖点', '户型简介', '交通配套', '城市',
)
writer.writerow(column_titles)
……
if __name__ == '__main__':
    # URL sub-domains and the matching Chinese city names, paired by position.
    cities = ['shenzhen', 'guangzhou', 'zhuhai', 'foshan', 'dongguan', 'huizhou', 'zhaoqing', 'zhongshan', 'jiangmen']
    chengshi = ['深圳', '广州', '珠海', '佛山', '东莞', '惠州', '肇庆', '中山', '江门']
    for city, c in zip(cities, chengshi):
        # Page range: listing pages 1..499 of this city's second-hand-house index.
        urls = [f'https://{city}.leyoujia.com/esf/?n={number}' for number in range(1, 500)]
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}
        driver = webdriver.Chrome()
        pg = 1
        for url in urls:
            driver.get(url)
            # Implicit wait of 10 seconds, intended to let the requested page finish rendering.
            driver.implicitly_wait(10)
            dom = etree.HTML(driver.page_source, etree.HTMLParser(encoding="utf-8"))
            # NOTE(review): the report excerpt stops here. The code that reads
            # fields out of `dom` and writes rows is not shown, and neither is
            # any use of `headers`/`pg` or a `driver.quit()`.
(2)数据集进行切分和清洗
alL_csv.to_csv('house_data.csv', index=False)
print('- Begin refine data')
# Keep columns 0-6, 8-9 and 12 onward of every row (header included);
# columns 7, 10 and 11 are dropped.
# NOTE: running this step twice on the same file drops further columns.
with open('house_data.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    useful_data = [row[:7] + row[8:10] + row[12:] for row in reader]
# The file is reopened for writing only after it has been read completely,
# because mode 'w' truncates it.
with open('house_data.csv', 'w', encoding='utf-8', newline='') as w:
    writer = csv.writer(w)
    writer.writerows(useful_data)
(3)导入实验所需的依赖库
import os
import pyecharts
from pyecharts import options as opts
from regional_data import *
from conf import *
import pyecharts
from pyecharts import options as opts
import house_price.conf
from regional_data import *
from conf import *
from orientation_data import *
import csv
from conf import *
from house_utils import *
from tqdm import tqdm
import re
import numpy as np
from house_utils import select_by_name
import random
from convert_to_digital import *
from matplotlib import pyplot as plt
(4)制作自定义工具函数
import conf
import csv
def get_data():……
# Data set loaded once, when this module is imported, via get_data();
# select_by_column (as its default) and select_by_name read from it.
DATA_LIST = get_data()
def select_by_column(column_name: str, select_data=None):
    """
    Collect one column from a sequence of rows.

    :param column_name: column's name; must be an entry of ``conf.TITLE``
    :param select_data: rows to read from; ``DATA_LIST`` is used when None
    :return: column data in (column_name), type = list
    :raises AssertionError: if ``column_name`` is not in ``conf.TITLE``
    """
    if select_data is None:
        select_data = DATA_LIST
    assert column_name in conf.TITLE, "title name not found!"
    # Position of the column inside each row.
    index = conf.TITLE.index(column_name)
    return [row[index] for row in select_data]
def select_by_name(name: str):
    """
    Return every row of ``DATA_LIST`` whose matching column equals ``name``.

    The column is chosen by the list ``name`` belongs to: a city
    (``conf.CITY_LIST``), an orientation (``conf.ORIENTATION_LIST``), a
    region (``conf.REGION_LIST``) or a house type (``conf.HOUSE_TYPE_LIST``).

    :param name: city, orientation, region or house-type name
    :return: all house data matching (name), type = list
    :raises NameError: if ``name`` is in none of the four lists
    """
    # Kept as a module-level global, as in the original code.
    global name_index
    if name in conf.CITY_LIST:
        name_index = 10
    elif name in conf.ORIENTATION_LIST:
        name_index = 8
    elif name in conf.REGION_LIST:
        name_index = 1
    elif name in conf.HOUSE_TYPE_LIST:
        name_index = 7
    else:
        raise NameError(f'no name is {name}')
    return [row for row in DATA_LIST if name == row[name_index]]
def convert_traffic_to_digital(traffic_flag: str) -> int:
    """
    Encode the traffic field as a binary flag.

    :param traffic_flag: raw traffic text of a listing
    :return: 0 when the text is exactly '空', otherwise 1
    """
    return 0 if traffic_flag == '空' else 1
if __name__ == '__main__':
    # Manual smoke test: print every row selected by the name '朝南'.
    for i in select_by_name('朝南'):
        print(i)
    print('end')
(5)数据分析
#导入库
#----------------------------------------------------------------------
#(1)条形图(城市间平均价格、中位数、标准差、平均面积、平均每平方米价格)
def gd_hist():value_bar = pyecharts.charts.Bar()value_bar.add_xaxis(conf.CITY_LIST)value_bar.add_yaxis('平均价格', [np.mean(zhongshan_price_lst).round(2),np.mean(zhuhai_price_lst).round(2),np.mean(dongguan_price_lst).round(2),np.mean(huizhou_price_lst).round(2),np.mean(zhaoqing_price_lst).round(2),np.mean(shenzhen_price_lst).round(2),np.mean(foshan_price_lst).round(2),np.mean(jiangmen_price_lst).round(2),np.mean(guangzhou_price_lst).round(2)],markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(name='Average house price', type_='average')]),markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem('max', type_='max'), opts.MarkPointItem('min', type_='min')]))value_bar.add_yaxis('中位数', [……], markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem('平均中位数', type_='average')]),markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem('最大中位数', type_='max'), opts.MarkPointItem('最小中位数', type_='min')]),)value_bar.add_yaxis('标准差', [……], markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem('平均方差', type_='average')]),markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem('最大方差', type_='max'), opts.MarkPointItem('最小方差', type_='min')]))value_bar.add_yaxis('平均面积', [……], markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem('平均面积', type_='average')]),markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem('最大平均面积', type_='max'),opts.MarkPointItem('最小平均面积', type_='min')]))value_bar.add_yaxis('每平方米平均价格', [……], markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem('平均每平米房价', type_='average')]),markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem('最大平米均价', type_='max'), opts.MarkPointItem('最小平米均价', type_='min')]))return value_bar
# value_bar.render('html/all_analyze_hist.html')
print('- Figure value_bar has been completed!')
#----------------------------------------------------------------------
#(2)城市间房价箱线图
def box_fun():
    """Build the box plot of house prices, one box per city."""
    box_plot = pyecharts.charts.Boxplot()
    box_plot.add_xaxis(conf.CITY_LIST)
    # Same order as the original literal list.
    # NOTE(review): this order must match conf.CITY_LIST for the boxes to sit
    # under the right city label - confirm against conf.
    city_price_lists = [
        zhongshan_price_lst,
        zhuhai_price_lst,
        dongguan_price_lst,
        huizhou_price_lst,
        zhaoqing_price_lst,
        shenzhen_price_lst,
        foshan_price_lst,
        jiangmen_price_lst,
        guangzhou_price_lst,
    ]
    box_plot.add_yaxis(
        '箱型图',
        box_plot.prepare_data(
            [[float(price) for price in prices] for prices in city_price_lists]
        ),
    )
    box_plot.set_global_opts(title_opts=opts.TitleOpts(title="各城市房价箱型图"))
    # …… (the remaining statements of this function are omitted in the report)
# ---------------------------------------------------------------------
#(3)小区数量饼状图
def gd_num_pie_fun():
    """Build a pie chart whose slices are the lengths of the per-city data lists."""
    gd_num_pie = pyecharts.charts.Pie()
    gd_num_pie.add(
        '各市级小区数量',
        [
            ['中山市', len(zhongshan)],
            ['珠海市', len(zhuhai)],
            ['东莞市', len(dongguan)],
            ['惠州市', len(huizhou)],
            ['肇庆市', len(zhaoqing)],
            ['深圳市', len(shenzhen)],
            ['佛山市', len(foshan)],
            ['江门市', len(jiangmen)],
            ['广州市', len(guangzhou)],
        ],
    )
    return gd_num_pie
# .render('html/广东小区数量.html')
print('- Figure gd_num_pie has been completed!')
#----------------------------------------------------------------------
#(4)朝向饼状图
def orientation_num_fun():
    """Build the pie chart pairing each orientation with its count."""
    orientation_num_pie = pyecharts.charts.Pie()
    # Orientation labels as listed in the original comment:
    # ['东北', '东南', '南北', '朝南', '朝北', '朝向暂无', '朝西', '朝东', '西北', '西南']
    orientation_num_pie.add(
        '各朝向房源数量',
        [[i, j] for i, j in zip(ORIENTATION_LIST, orientation_num_lst)],
    )
    # …… (the remaining statements of this function are omitted in the report)
# ---------------------------------------------------------------------
#(5)朝向平均价格及价格中位数条形图
def orientation_bar_fun():orientation_bar = pyecharts.charts.Bar()orientation_bar.add_xaxis(ORIENTATION_LIST)# ['东北', '东南', '南北', '朝南', '朝北', '朝向暂无', '朝西', '朝东', '西北', '西南']orientation_bar.add_yaxis('该朝向平均价格(万/套)', [np.mean(northeast_price_lst).round(2),np.mean(southeast_price_lst).round(2),np.mean(southnorth_price_lst).round(2),np.mean(south_price_lst).round(2),np.mean(north_price_lst).round(2),np.mean(none_orientation_price_lst).round(2),np.mean(west_price_lst).round(2),np.mean(east_price_lst).round(2),np.mean(northwest_price_lst).round(2),np.mean(southwest_price_lst).round(2),], markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem('平均价格', type_='average')]),markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem('最大均价', type_='max'),opts.MarkPointItem('最小均价', type_='min')]))orientation_bar.add_yaxis('该朝向价格中位数(万/套)', […… ], markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem('平均中位数', type_='average')]),markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem('最大中位数', type_='max'),opts.MarkPointItem('最小中位数', type_='min')]))return orientation_bar
……
# ---------------------------------------------------------------------
#(6)珠江三角洲部分区域小区数量的热力图
def region_hot_map_fun():
    """
    Build a Geo heat map of Guangdong from the per-region arrays.

    :return: the configured ``pyecharts.charts.Geo`` chart
    """
    # The original wrote `globals.GeoType`; with no `globals` module imported
    # that resolves to the builtin `globals` function and fails. Import the
    # enum explicitly instead.
    from pyecharts.globals import GeoType

    region_hot_map = pyecharts.charts.Geo(init_opts=opts.InitOpts('1200px', '1500px'))
    region_hot_map.add_schema('广东')
    # Register custom coordinates (longitude, latitude) for each region name.
    region_hot_map.add_coordinate('大涌镇', 113.306477, 22.469488)
    region_hot_map.add_coordinate('中堂镇', 113.663426, 23.098861)
    # …… (further add_coordinate calls are omitted in the report)
    region_hot_map.add_coordinate('东城区', 113.784069, 23.03539)
    region_hot_map.add_coordinate('道滘镇', 113.68167, 23.010235)
    # NOTE(review): same coordinates as 道滘镇 above - looks like a copy-paste
    # slip; confirm the real location of 洪梅镇.
    region_hot_map.add_coordinate('洪梅镇', 113.68167, 23.010235)
    # NOTE(review): the series is named "number of communities" but the values
    # come from region_ave_price_arr - confirm which one is intended.
    region_hot_map.add(
        '各市级小区数量',
        [[i, int(j)] for i, j in zip(region_name_arr, region_ave_price_arr)],
        type_=GeoType.HEATMAP,
    )
    region_hot_map.set_global_opts(
        title_opts=opts.TitleOpts(title="广东省地图"),
        visualmap_opts=opts.VisualMapOpts(max_=1000, min_=0),
    )
    return region_hot_map
……
# ---------------------------------------------------------------------
#(7)珠江三角洲部分区域的房价条方图
def region_hist_fun():
    """Build the per-region bar chart with four series sharing one x axis."""
    region_hist = pyecharts.charts.Bar(init_opts=opts.InitOpts(height='6000px'))
    region_hist.add_xaxis(list(region_name_arr))
    # (series name, values) pairs, added in the original order with identical
    # bar-width and label settings.
    series = (
        ('各区县平均房价(万/套)', region_ave_price_arr),
        ('各区县房价中位数(万/套)', region_median_price_arr),
        ('各区县房屋平均面积(平方)', region_m_data_arr),
        ('各区县房屋一平方价格(元/平方)', region_m_price_arr),
    )
    for series_name, values in series:
        region_hist.add_yaxis(
            series_name,
            list(values),
            bar_min_width=10,
            bar_max_width=20,
            label_opts=opts.LabelOpts(position='right'),
        )
    # …… (the remaining statements of this function are omitted in the report)
# ---------------------------------------------------------------------
#(8)词云
def word_cloud_fun():
    """Build a word cloud of region names weighted by region_ave_price_arr."""
    word_cloud = pyecharts.charts.WordCloud()
    word_cloud.add(
        '城市词云',
        [[i, j] for i, j in zip(list(region_name_arr), list(region_ave_price_arr))],
    )
    # …… (the remaining statements of this function are omitted in the report)
# ---------------------------------------------------------------------
#(9)珠江三角洲部分区域平均房价,中位数,平均面积条方图
def house_type_bar_fun():
    """Build the per-house-type bar chart (mean and median price), axes swapped."""
    house_type_bar = pyecharts.charts.Bar(init_opts=opts.InitOpts(height='6000px'))
    house_type_bar.add_xaxis(list(house_type_x_axis))
    house_type_bar.add_yaxis(
        '各户型平均价格',
        list(house_type_mean_price_arr),
        bar_min_width=10,
        bar_max_width=20,
        label_opts=opts.LabelOpts(position='right'),
    )
    house_type_bar.add_yaxis(
        '各户型价格中位数',
        list(house_type_median_price_arr),
        bar_min_width=10,
        bar_max_width=20,
        label_opts=opts.LabelOpts(position='right'),
    )
    # Swap the axes so the bars run horizontally.
    house_type_bar.reversal_axis()
    # …… (the remaining statements of this function are omitted in the report)
# ---------------------------------------------------------------------
print('- draw end')
(6)房价拟合线性回归
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from convert_to_digital import *
from sklearn.tree import DecisionTreeRegressor
import pydotplus
import graphviz
from sklearn import tree as t
# Features as float32 rounded to two decimals.
x = np.array(x, np.float32).round(2)
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24, so the builtin is used directly.
y = np.array(y, int)
# Standardisation is currently disabled:
#x = preprocessing.StandardScaler().fit(x).transform(x)
# Bind the `sklearn` name explicitly; the `from sklearn import ...` lines above
# do not make `sklearn.utils` reachable by that name.
import sklearn.utils
x, y = sklearn.utils.shuffle(x, y)
# Hold out 10% as the test split; `x`/`y` are rebound to the training split.
x, x_test, y, y_test = train_test_split(x, y, test_size=0.1, shuffle=True)
print('-------------------------------------')
# Two regressors: a linear model and a decision tree capped at 100 leaves.
line = LinearRegression()
tree = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
print('- Fitting...')
for model in (line, tree):
    model.fit(x, y)
print('- Fit end')
# Sample to predict, as a single row. Feature order:
# [price per unit area, area, house type, orientation, transport facilities, city]
test = np.array([100000, 120, 53, 2, 1, 5])[np.newaxis, :]
# Export the fitted tree as DOT text and render it to a PNG file.
dot_data = t.export_graphviz(tree, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png("tree.png")
……
# Predictions of both fitted models on x_test.
y_pre, y_pre_tree = line.predict(x_test), tree.predict(x_test)
# Transpose of x, so each row holds one feature column.
ts = x.T
# Parameters of the fitted linear model.
coef, intercept = line.coef_, line.intercept_
……
# Correlation matrix of the feature rows in `ts` together with the target `y`.
corr = np.corrcoef(ts, y).tolist()
plt.rcParams['font.sans-serif'] = ['KaiTi']  # set the default font
plt.rcParams['axes.unicode_minus'] = False
# Colour range of the heat map.
# NOTE(review): vmin=0 paints every negative correlation the same as 0 -
# confirm this is intended.
plt.imshow(corr, cmap=plt.cm.hot, vmin=0, vmax=1)
# Tick labels for the X and Y axes: the six features followed by the target.
axis_labels = ['面积均价', '面积', '户型', '朝向', '交通配套', '城市', '总价']
plt.xticks(ticks=list(range(7)), labels=axis_labels)
plt.yticks(ticks=list(range(7)), labels=axis_labels)
plt.colorbar()
plt.show()  # display the heat map
plt.figure()
# One scatter plot per feature, with a red line drawn from that feature's
# coefficient and the shared intercept of the linear model.
# NOTE(review): this line ignores the contribution of the other features, so
# it is not a per-feature best fit - confirm this is what should be shown.
for i in range(x.shape[1]):
    plt.subplot(3, 3, i + 1)
    plt.scatter(x[:, i], y)
    plt.plot(x[:, i], x[:, i] * coef[i] + intercept, color='r')
plt.show()
二、测试数据及运行结果
(1)条形图
(城市间平均价格、中位数、标准差、平均面积如图1.1,平均每平方米价格如图1.2)
(2)城市间房价箱线图
(3)珠江三角洲部分区域小区数量饼状图
(4)朝向饼状图
(5)朝向平均价格及价格中位数条形图
(6)珠江三角洲部分区域小区数量的热力图
(7)珠江三角洲部分区域的房价条方图
(8)城市词云
(9)条方图
珠江三角洲部分区域平均房价,中位数,平均面积条方图
(10)预测售价线性回归图、预测售价热图
自定义一套房源数据:面积均价(每平方米均价)为100000元,房屋面积120平方米,五室一厅四卫,东南朝向,附近有交通配套,位于深圳。根据该数据预测出来的结果如下:
预测售价热图:
预测售价结果:线性回归算法预测售价1030万,决策树算法预测售价1443万,朴素贝叶斯算法预测售价968万
三、总结
经过这个学期的机器学习这门课程,了解了机器学习的算法和相关知识,并利用这学期所学知识初步完成此次项目的开发。
机器学习分为监督学习和无监督学习,监督学习这类算法是要知道我们将要预测什么,目标变量的信息。这类算法又分两部分,一是解决分类任务,比如:k-近邻算法、朴素贝叶斯算法、决策树、Logistic回归、SVM、AdaBoost元算法等;二是解决回归任务的,它主要预测数值型数据,比如:线性回归,当然也有和分类相结合的:树回归。
根据数据集及实际问题选择合适的算法,例如本次使用的是线性回归预测房价。机器学习的应用领域非常广泛,学习难度也比较大,说白了就是利用计算机的能力对给定的数据进行处理。说到底,机器学习就是一种数据科学,要求一定的数学基础(线性代数、离散数学、高等数学这三门大学基础课程都必须掌握牢固),当然也要尽量提升自己的数学水平。实际上,所谓的机器学习,是面对一个具体的问题,从给定的数据中产生模型的算法,也就是说脱离了实际问题谈机器学习算法是毫无意义的。