Making an EPUB with Python
- Preparation
- Getting started
  - Step 1: Analyze the website
  - Step 2: Scrape and clean the data
    - Fetching chapter content
  - Step 3: Save to EPUB
- Full code
Preparation
- The ebooklib library
  - If the version on PyPI is too old, clone the repository from GitHub and run python setup.py install
- The zhconv library, used for converting between Simplified and Traditional Chinese
- A novel site that permits scraping: novel-backup
- A little time
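As a quick sanity check of zhconv, a minimal sketch (the sample string is just an illustration):

# Minimal zhconv check: convert Traditional Chinese to Simplified ('zh-hans').
from zhconv import convert

print(convert('魔導具師', 'zh-hans'))  # prints 魔导具师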
Getting started
Step 1: Analyze the website
Start from the site you want to scrape and fetch the index that lists all chapters:
https://novels.novel-backup.cf/index/1558018541.json
Inspect what comes back (some entries omitted):

[
  {"name": "41.成套的葡萄酒杯", "id": 7460},
  {"name": "42.烤肉午餐", "id": 7550}
]
The id of each entry is the yy part of the chapter-content URL xx/yy.json used in the next step.
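A quick sketch (nothing beyond requests is needed) that ties the two together, fetching the index and deriving each chapter's URL from its id:

import requests

# Fetch the chapter index, then build each chapter's JSON URL from its id.
index = requests.get('https://novels.novel-backup.cf/index/1558018541.json').json()
for ch in index[:2]:
    print(ch['name'], '->', 'https://novels.novel-backup.cf/novels/{}.json'.format(ch['id']))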
Then fetch one chapter's content and inspect it:
https://novels.novel-backup.cf/novels/93065.json
(some fields omitted)

{
  "code_id": 1558018541,
  "title": "第1卷插圖",
  "create_date": "2020-10-07 20:51:33",
  "content": "<p><img src=\"https://live.staticflickr.com/65535/50431755246_afecb655fc_o.png[/img][/url][url=https://flic.kr/p/2jQtVPu]魔導具師ダリヤはうつむかない 1-0[/url] by [url=https://www.flickr.com/photos/55799173@N00/]jameslam518[/url], on Flickr\" class=\"fr-fic fr-dib\"></p><p><br></p>",
  "author": "職業量地官",
  "views": 2896
}
For our purposes, the useful fields are title, content, and author.
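As a throwaway check, those three fields can be pulled out like this (again, just requests):

import requests

# Keep only the fields the EPUB needs from one chapter's JSON.
js = requests.get('https://novels.novel-backup.cf/novels/93065.json').json()
print({k: js[k] for k in ('title', 'content', 'author')})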
Step 2: Scrape and clean the data
ebooklib lays chapters out in the order they are added with add_item (and listed in the TOC and spine), so the scraped chapters need to be sorted first.
Start a new .py file and define a class Espider with the following methods:
def getJson(self, url):
    html: requests.Response = requests.get(url)
    return html.json()

def getDictList(self, url):
    js: typing.List[dict] = self.getJson(url)
    return js

def getFilter(self, li_list):
    maxx = 0
    id_dicts = []
    for li in li_list:
        idict = li
        # normalize the name to Simplified Chinese before matching
        idict['name'] = convert(idict['name'], 'zh-hans')
        ll = re.findall(r'([1-9]\d*.\d*|0\.\d*[1-9]\d*)', idict['name'])
        if len(ll) > 0:
            s: str = ll[0]
            num = int(s[:-1])
            idict['num'] = num
            maxx = max(maxx, num)
        else:
            ll = re.findall(r'第([1-9]\d*)话', idict['name'])
            if len(ll) > 0:
                s: str = ll[0]
                num = int(s)
                idict['num'] = num
                maxx = max(num, maxx)
            else:
                # no number in the name: place it right after the largest number seen so far
                maxx += 1
                idict['num'] = maxx
        id_dicts.append(idict)
    id_dicts.sort(key=lambda it: it['num'])
    tmp_list: typing.List[dict] = []
    for i in range(len(id_dicts)):
        id_dicts[i]['i'] = str(i)
        tmp_list.append(id_dicts[i])
    return tmp_list
First the data is fetched and converted into the right shape (getJson, getDictList). getFilter looks long, but it just extracts the number at the front of each chapter name in the link list, e.g. 174. 疲勞與真心話 or 第3話 商業公會; any chapter whose name contains no number gets num = maxx, the running maximum plus one, so it keeps its place in the sequence. A small illustration follows.
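Here is that extraction on the two sample names, after conversion to Simplified Chinese (note that the unescaped dot in the first pattern happens to match the 第N话 form as well):

import re

# getFilter's first pattern: a number, one arbitrary character, then optional digits.
for name in ('174. 疲劳与真心话', '第3话 商业公会'):
    ll = re.findall(r'([1-9]\d*.\d*|0\.\d*[1-9]\d*)', name)
    print(name, '->', int(ll[0][:-1]))
# 174. 疲劳与真心话 -> 174
# 第3话 商业公会 -> 3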
Fetching chapter content
Images need special handling: each one is saved to disk first, then added to the EPUB file later on.
def getDict(self, url):
    js: dict = self.getJson(url)
    return js

def saveImg(self, title, src):
    path = 'Images/{}'.format(title)
    os.makedirs(path, exist_ok=True)  # also creates Images/ and is safe under concurrent threads
    # the Flickr src carries BBCode residue; extract the bare file name
    s = re.findall(r'65535/(.*?)\[/img\]', src)
    if len(s) == 0:
        s = re.findall(r'65535/(.*?\.png)', src)[0]
    else:
        s = s[0]
    res: requests.Response = requests.get(src, stream=True)
    res.raise_for_status()
    with open("{}/{}".format(path, s), "wb") as f:
        f.write(res.content)
    self.img_list.append({'src': "{}/{}".format(path, s), 'uid': s.split('.')[0]})
    return "{}/{}".format(path, s)

def contentCheck(self, title, content: str):
    soup = BeautifulSoup(content, 'lxml')
    for img in soup.findAll('img'):
        # download the image and point the tag at the local copy
        s = self.saveImg(title, img['src'])
        img['src'] = s
    return str(soup.body)

def getContent(self, id):
    url_s = 'https://novels.novel-backup.cf/novels/'
    url_e = '.json'
    print(url_s + id + url_e)
    js = self.getDict(url_s + id + url_e)
    js['author'] = convert(js['author'], 'zh-hans')
    js['title'] = convert(js['title'], 'zh-hans')
    js['content'] = convert(js['content'], 'zh-hans')
    return '<p>搬运:' + js['author'] + '</p>' + self.contentCheck(js['title'], js['content'])
getDict fetches the data; getContent then reads the fields off js and runs the content through contentCheck, after which the processed chapter is stored in a list.
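A standalone sketch of the rewrite that contentCheck performs (the local path is hypothetical; in the real code it comes from saveImg):

from bs4 import BeautifulSoup

# Point every <img> at its local copy, the same transformation contentCheck does.
soup = BeautifulSoup('<p><img src="https://live.staticflickr.com/65535/x.png"></p>', 'lxml')
for img in soup.findAll('img'):
    img['src'] = 'Images/第1卷插圖/x.png'  # hypothetical path; saveImg returns this in the real code
print(str(soup.body))  # <body><p><img src="Images/第1卷插圖/x.png"/></p></body>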
Step 3: Save to EPUB
Create a new ebook.py file and import ebooklib and Espider (plus threading, typing, and os):
toc = []
spine = ['nav']
book = epub.EpubBook()
chp_list = []

def init(title, author):
    # set metadata
    book.set_identifier('id123456')
    book.set_title(title)
    book.set_language('zh')  # 'zh' is the language code for Chinese
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous', role='ill', uid='coauthor')
    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # define CSS style
    style = 'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;color:#263238;line-height:1.6;font-size:13px;border-radius:3px;margin-top:0;margin-bottom:1em;overflow:auto}b,strong{font-weight:bolder}#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    # add CSS file
    book.add_item(nav_css)

def saveChapter():
    c1 = getChapter('前言', '<p>使用python ebooklib整合,数据来源https://novel-backup.cf/,仅供参考请勿商用</p>', '000')
    book.add_item(c1)
    toc.append(epub.Link(c1.file_name, c1.title, c1.title))
    spine.append(c1)
    for it in chp_list:
        # For each chapter add chapter to the book, TOC and spine
        book.add_item(it['chapter'])
        toc.append(epub.Link(it['chapter'].file_name, it['chapter'].title, it['chapter'].title))
        spine.append(it['chapter'])

def saveImage(img_list: typing.List[dict]):
    for img in img_list:
        image_content = open(img['src'], 'rb').read()
        img = epub.EpubImage(uid=img['uid'], file_name=img['src'], media_type='image/png', content=image_content)
        book.add_item(img)

def saveEpub(file_name):
    # define Table Of Contents
    book.toc = tuple(toc)
    # basic spine
    book.spine = spine
    # write to the file
    os.makedirs('epub', exist_ok=True)  # make sure the output folder exists
    epub.write_epub('epub/' + file_name, book, {})

def getChapter(title, content, id):
    c1 = epub.EpubHtml(title=title, file_name='chap_' + id + '.xhtml', lang='zh')
    c1.content = '<h1>' + title + '</h1>' + content
    return c1

def poChapter(it, llen):
    i = int(it['i']) + 1
    c = getChapter(it['name'], es.getContent(str(it['id'])), str(i).zfill(llen))
    chp_list.append({'chapter': c, 'id': i})

if __name__ == '__main__':
    init('魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~', '自动叉积·整合')
    es = Espider()
    li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    li_list = es.getDictList(li_url)
    id_dicts = es.getFilter(li_list)
    llen = len(str(len(id_dicts)))
    # poChapter(id_dicts[0], llen)
    # create the scraping threads, four chapters per batch
    index = [i for i in range(0, len(id_dicts), 4)]
    threads = []
    for i in index:
        for j in range(0, 4):
            if i + j < len(id_dicts):  # the last batch may hold fewer than 4 chapters
                threads.append(threading.Thread(target=poChapter, args=(id_dicts[i + j], llen)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('Main thread has ended!')
    chp_list.sort(key=lambda it: it['id'])
    saveChapter()
    saveImage(es.img_list)
    saveEpub('《魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~》.epub')
init follows the example in the official ebooklib documentation; str(i).zfill(llen) zero-pads the chapter number so the file names sort correctly, e.g. 'chap_002.xhtml'.
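For instance:

# zfill pads the number with leading zeros to llen digits.
llen = 3
print('chap_' + str(2).zfill(llen) + '.xhtml')  # prints chap_002.xhtml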
threading is brought in so the scraping runs on multiple threads, which speeds things up; a pool-based alternative is sketched below.
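Not the author's code, but a concurrent.futures sketch that achieves the same fan-out without the manual batch arithmetic:

from concurrent.futures import ThreadPoolExecutor

# Hand every chapter to a pool of 4 workers; the executor handles the batching.
with ThreadPoolExecutor(max_workers=4) as pool:
    for it in id_dicts:
        pool.submit(poChapter, it, llen)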
Full code
# spider.py
import requests
from bs4 import BeautifulSoup
import typing
import re
import os
from zhconv import convert

class Espider:
    # https://novels.novel-backup.cf/index/1558018541.json
    # https://novels.novel-backup.cf/novels/7460.json
    img_list = []

    def getJson(self, url):
        html: requests.Response = requests.get(url)
        # soup = BeautifulSoup(html.json())
        return html.json()

    def getDict(self, url):
        js: dict = self.getJson(url)
        # print(js)
        return js

    def getDictList(self, url):
        js: typing.List[dict] = self.getJson(url)
        # print(js)
        return js

    def saveImg(self, title, src):
        path = 'Images/{}'.format(title)
        os.makedirs(path, exist_ok=True)  # also creates Images/ and is safe under concurrent threads
        # print(src)
        s = re.findall(r'65535/(.*?)\[/img\]', src)
        # print(s)
        if len(s) == 0:
            s = re.findall(r'65535/(.*?\.png)', src)[0]
        else:
            s = s[0]
        # print(s)
        res: requests.Response = requests.get(src, stream=True)
        res.raise_for_status()
        with open("{}/{}".format(path, s), "wb") as f:
            f.write(res.content)
        self.img_list.append({'src': "{}/{}".format(path, s), 'uid': s.split('.')[0]})
        return "{}/{}".format(path, s)

    def contentCheck(self, title, content: str):
        soup = BeautifulSoup(content, 'lxml')
        # print(soup)
        for img in soup.findAll('img'):
            s = self.saveImg(title, img['src'])
            img['src'] = s
        # ''.join(str(it) for it in soup.find_all('p'))
        return str(soup.body)

    def getContent(self, id):
        url_s = 'https://novels.novel-backup.cf/novels/'
        url_e = '.json'
        print(url_s + id + url_e)
        js = self.getDict(url_s + id + url_e)
        js['author'] = convert(js['author'], 'zh-hans')
        js['title'] = convert(js['title'], 'zh-hans')
        js['content'] = convert(js['content'], 'zh-hans')
        # print(js['author'], js['title'], js['content'])
        return '<p>搬运:' + js['author'] + '</p>' + self.contentCheck(js['title'], js['content'])

    def getFilter(self, li_list):
        maxx = 0
        id_dicts = []
        for li in li_list:
            idict = li
            idict['name'] = convert(idict['name'], 'zh-hans')
            ll = re.findall(r'([1-9]\d*.\d*|0\.\d*[1-9]\d*)', idict['name'])
            if len(ll) > 0:
                s: str = ll[0]
                num = int(s[:-1])
                idict['num'] = num
                maxx = max(maxx, num)
            else:
                ll = re.findall(r'第([1-9]\d*)话', idict['name'])
                if len(ll) > 0:
                    s: str = ll[0]
                    num = int(s)
                    idict['num'] = num
                    maxx = max(num, maxx)
                else:
                    maxx += 1
                    idict['num'] = maxx
            id_dicts.append(idict)
        id_dicts.sort(key=lambda it: it['num'])
        tmp_list: typing.List[dict] = []
        for i in range(len(id_dicts)):
            id_dicts[i]['i'] = str(i)
            tmp_list.append(id_dicts[i])
        return tmp_list

    def getIdList(self, li_list):
        id_list: typing.List[str] = [str(it['id']) for it in li_list]
        return id_list

if __name__ == "__main__":
    print("爬取开始")
    # po = pool.Pool(5)
    # li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    es = Espider()
    # li_list = es.getDictList(li_url)
    # # print(li_list)
    # id_dicts = es.getFilter(li_list)
    # print(id_dicts)
    print(es.getContent('112353'))
    print(es.getContent('16733'))
    # print(es.img_list)
    print('爬取结束')

# ebook.py
import threading
import typing
from ebooklib import epub
import os
from spider import Espider

toc = []
spine = ['nav']
book = epub.EpubBook()
chp_list = []

def init(title, author):
    # set metadata
    book.set_identifier('id123456')
    book.set_title(title)
    book.set_language('zh')  # 'zh' is the language code for Chinese
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous', role='ill', uid='coauthor')
    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # define CSS style
    style = 'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;color:#263238;line-height:1.6;font-size:13px;border-radius:3px;margin-top:0;margin-bottom:1em;overflow:auto}b,strong{font-weight:bolder}#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    # add CSS file
    book.add_item(nav_css)

def saveChapter():
    c1 = getChapter('前言', '<p>使用python ebooklib整合,数据来源https://novel-backup.cf/,仅供参考请勿商用</p>', '000')
    book.add_item(c1)
    toc.append(epub.Link(c1.file_name, c1.title, c1.title))
    spine.append(c1)
    for it in chp_list:
        # For each chapter add chapter to the book, TOC and spine
        book.add_item(it['chapter'])
        toc.append(epub.Link(it['chapter'].file_name, it['chapter'].title, it['chapter'].title))
        spine.append(it['chapter'])
        # print('save c', chapter.file_name)

def saveImage(img_list: typing.List[dict]):
    for img in img_list:
        image_content = open(img['src'], 'rb').read()
        img = epub.EpubImage(uid=img['uid'], file_name=img['src'], media_type='image/png', content=image_content)
        book.add_item(img)

def saveEpub(file_name):
    # define Table Of Contents
    book.toc = tuple(toc)
    # basic spine
    book.spine = spine
    # write to the file
    os.makedirs('epub', exist_ok=True)  # make sure the output folder exists
    epub.write_epub('epub/' + file_name, book, {})

def getChapter(title, content, id):
    c1 = epub.EpubHtml(title=title, file_name='chap_' + id + '.xhtml', lang='zh')
    c1.content = '<h1>' + title + '</h1>' + content
    print("g", c1.file_name, c1.title, id)
    return c1

def poChapter(it, llen):
    # print("开始进程", it['i'])
    i = int(it['i']) + 1
    c = getChapter(it['name'], es.getContent(str(it['id'])), str(i).zfill(llen))
    chp_list.append({'chapter': c, 'id': i})
    # saveChapter(c, it['i'])

if __name__ == '__main__':
    init('魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~', '自动叉积·整合')
    es = Espider()
    li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    li_list = es.getDictList(li_url)
    id_dicts = es.getFilter(li_list)
    llen = len(str(len(id_dicts)))
    # poChapter(id_dicts[0], llen)
    # create the scraping threads, four chapters per batch
    index = [i for i in range(0, len(id_dicts), 4)]
    threads = []
    for i in index:
        for j in range(0, 4):
            if i + j < len(id_dicts):  # the last batch may hold fewer than 4 chapters
                threads.append(threading.Thread(target=poChapter, args=(id_dicts[i + j], llen)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('Main thread has ended!')
    chp_list.sort(key=lambda it: it['id'])
    saveChapter()
    # es.img_list.append('Images/第6卷插圖/51154283631_826ee93727_o.png')
    saveImage(es.img_list)
    saveEpub('《魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~》.epub')