[python]《天龙八部》文本处理

1、读取每章标题
2、统计人物出现的次数
3、将1和2输出为txt和excel
# 处理小说文本
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
import jieba
import re
from gensim import corpora,models,similarities
# Read the novel text.
# NOTE(review): no encoding is passed here, so the platform default is used;
# confirm it matches the file (the name list further down is read as UTF-8).
with open(r'D:\\天龙八部.txt', "r") as f:
    data = f.read()

# Split the text on blank lines and drop every empty fragment.  (Calling
# list.remove('') would drop only the first empty fragment and raise
# ValueError when there is none.)
data = [chapter for chapter in data.split('\n\n') if chapter]

# Normalise aliases so that each character is counted under a single name:
# 萧峰 and 乔峰 are the same person, 慕容先生 refers to 慕容博 and 慕容公子 to
# 慕容复.  The patterns are plain literals, so str.replace is sufficient.
_ALIASES = (
    ('萧峰', '乔峰'),
    ('慕容先生', '慕容博'),
    ('慕容公子', '慕容复'),
)
for alias, canonical in _ALIASES:
    data = [chapter.replace(alias, canonical) for chapter in data]

data_str = ''.join(data)  # the whole processed text as a single string
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
# 处理人物文本
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
# Read the file that lists the character names.
with open(r'D:\\天龙八部人物名称.txt', "r", encoding = 'UTF-8') as f:
    dataname = f.read()
# Entries in the file are separated by EN SPACE (U+2002); split into a list.
dataname = dataname.split('\u2002')
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
# Collect the chapter titles
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
# The title is taken to be the first line of each element of `data`; only the
# first 50 characters are examined.  If those 50 characters contain no
# newline, they are used as the title instead of raising IndexError.
biaotilist = [chapter[:50].partition('\n')[0] for chapter in data]
print("天龙八部共%s章,标题如下："%len(biaotilist))
for title in biaotilist:
    print(title)
print("\n")
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
# 小说的主角
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
# Keep only the CJK characters of every entry to obtain the bare name.
dataname = [''.join(re.findall('[\u4e00-\u9fa5]', raw)) for raw in dataname]
data_name = dataname[:]  # copy of the cleaned names, kept as plain strings

# Count how often each name occurs across all elements of `data`.
# Names consist only of CJK characters after the cleaning step above, so a
# plain substring count gives the same result as a regex search.
counts = {}
for name in dataname:
    # An empty name would "match" at every position and produce a bogus
    # count; duplicated names only need to be counted once.
    if not name or name in counts:
        continue
    counts[name] = sum(chapter.count(name) for chapter in data)

# List of (name, count) pairs, most frequently mentioned first.
dataname = sorted(counts.items(), key=lambda item: item[1], reverse=True)

print("人物以及人物出现的次数：")
# Iterate over the actual result instead of a hard-coded row count, which
# raised IndexError whenever fewer distinct names were present.
for name, count in dataname:
    # Same line layout as printing each cell followed by a space and then
    # a final " ": "<name> <count>" plus two trailing spaces.
    print(name, count, end="  \n")
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
# 输出Excel和Txt文件
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
def _write_titles(path, titles):
    """Write the chapter titles to *path* as GBK-encoded text.

    The file starts with the header line ``章节``.  Each title is written on
    its own line as its first four characters, a space, and everything from
    index 6 onwards (the characters at indices 4 and 5 are dropped).

    Args:
        path: Output file name.
        titles: Iterable of title strings.
    """
    # The context manager closes the file even if a write fails.
    with open(path, 'w', encoding='gbk') as output:
        output.write('章节\n')
        for row in titles:
            output.write("{} {}".format(row[0:4], row[6:]))
            output.write('\n')


# The ".xls" file receives exactly the same plain-text content as the ".txt".
_write_titles('每章标题.xls', biaotilist)
_write_titles('每章标题.txt', biaotilist)
# ············································································································
# Tab-separated table of names and counts, saved with an ".xls" extension.
# The context manager closes the file even if a write fails.
with open('人物名及出现次数.xls','w',encoding='gbk') as output:
    output.write("人物名\t出现次数\n")
    for row in dataname:
        # write() only accepts strings, hence str(); every cell, including
        # the last one of a row, is followed by a tab.
        output.write(''.join(str(cell) + '\t' for cell in row))
        output.write('\n')

# The same data as comma-separated text.
with open('人物名及出现次数.txt','w',encoding='gbk') as output:
    output.write('人物名,出现次数\n')
    for row in dataname:
        output.write('{},{}'.format(row[0], row[1]))
        output.write('\n')
# ————————————————————————————————————————————————————————————————————————————————————————————————————————————
[python]《天龙八部》文本处理

相关文章

天龙八部源码描述

天龙八部服务器端Lua脚本系统

天龙八部中的扫地僧的真实身份

天龙源码框架分析_天龙八部源码描述【转】

天龙八部中无名老僧的由来

天龙八部找怪物ID

天龙八部资源提取源代码

也谈《天龙八部》