再也不用花钱买漫画!Python爬取某漫画的脚本及源码

一、工具

python3
第三方类库requests
python3-pyqt5(gui依赖，不用gui可不装)

ubuntu系列系统使用以下命令安装依赖：

sudo apt-get install python3 python3-requests python3-pyqt5

url格式: 漫画首页的url，如 http://m.ac.qq.com/comic/view/id/518333 (移动版) 或 http://ac.qq.com/comic/comicinfo/id/17114 (pc版)

注意: 下载火影忍者彩漫时，需要先访问m.ac.qq.com搜索火影忍者以取得其地址，因为在pc端页面上火影忍者彩漫和黑白漫画共用同一个id和同一个url。

二、命令行帮助

usage: getcomic.py [-h] [-u url] [-p path] [-d] [-l list]
 
*下载腾讯漫画，仅供学习交流，请勿用于非法用途*
空参运行进入交互式模式运行。
 
optional arguments:
  -h, --help            show this help message and exit
  -u url, --url url     要下载的漫画的首页，可以下载以下类型的url: 
                        http://ac.qq.com/comic/comicinfo/id/511915
                        http://m.ac.qq.com/comic/comicinfo/id/505430
                        http://pad.ac.qq.com/comic/comicinfo/id/505430
                        http://ac.qq.com/naruto
  -p path, --path path  漫画下载路径。 默认: /home/fengyu/tencent_comic
  -d, --dir             将所有图片下载到一个目录(适合腾讯漫画等软件连看使用)
  -l list, --list list  要下载的漫画章节列表，不指定则下载所有章节。格式范例: 
                        n - 下载具体某一章节，如-l 1, 下载第1章
                        n,n... - 下载某几个不连续的章节，如 "-l 1,3,5", 下载1,3,5章
                        n-n... - 下载某一段连续的章节，如 "-l 10-50", 下载[10,50]章
                        杂合型 - 结合上面所有的规则，如 "-l 1,3,5-7,11-111"

三、gui预览效果

支持不连续的章节选择下载

windows预览效果：

deepin/linux 预览效果：

四、命令行版源码(getcomic.py)

import requests
import re
import json
import os
import argparse
# Shared HTTP session: every request in this module goes through it, so the
# headers configured here are sent with each request.
requestsession = requests.Session()

# iPad Safari User-Agent, written with implicit string concatenation instead of
# backslash continuations inside the literal.
# NOTE(review): the letter case was restored to the standard iPad UA form on the
# assumption that the all-lowercase value was an extraction artifact -- confirm.
ua = ('Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X; en-us) '
      'AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 '
      'Mobile/9B176 Safari/7534.48.3')
requestsession.headers.update({'user-agent': ua})
class errorcode(Exception):
    """Failure raised by the downloader, carrying a numeric code.

    Codes:
        1: the URL is not a supported comic URL
        2: the URL could not be resolved to a mobile-site URL
        3: the download was interrupted

    Attributes:
        code: the numeric code passed to the constructor.
    """

    def __init__(self, code):
        # Forward the code to Exception so that ``args`` is populated too.
        super().__init__(code)
        self.code = code

    def __str__(self):
        return repr(self.code)
# Accepted comic home-page URL shapes. Compiled once at import time.
# Dots in host names are escaped so they only match a literal '.', matching is
# case-insensitive, and both http and https are accepted.
_LEGAL_URL_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'https?://ac\.qq\.com/comic/comicinfo/id/\d+/?',
        r'https?://m\.ac\.qq\.com/comic/comicinfo/id/\d+/?',
        r'https?://ac\.qq\.com/\w+/?',
        r'https?://pad\.ac\.qq\.com/comic/comicinfo/id/\d+/?',
    )
)


def islegelurl(url):
    """Return True if *url* is one of the supported comic home-page URL forms.

    Supported forms (an optional trailing slash is allowed on each):
        http(s)://ac.qq.com/comic/comicinfo/id/<digits>
        http(s)://m.ac.qq.com/comic/comicinfo/id/<digits>
        http(s)://pad.ac.qq.com/comic/comicinfo/id/<digits>
        http(s)://ac.qq.com/<word>

    Args:
        url: the URL string to validate.

    Returns:
        True when the whole string matches one of the forms, otherwise False.
    """
    # fullmatch requires the entire string to match, so no ^/$ anchors are
    # needed and a trailing newline is not silently accepted.
    return any(pattern.fullmatch(url) for pattern in _LEGAL_URL_PATTERNS)
def getid(url):
    """Extract the numeric comic id from a comic home-page URL.

    If the URL does not end in digits (optionally followed by one '/'), the URL
    is requested through the shared session and the id is taken from the final
    URL of that response instead.

    Args:
        url: comic home-page URL; must be accepted by islegelurl().

    Returns:
        The comic id as a string of digits.

    Raises:
        errorcode: code 1 if *url* is not a supported URL; code 2 if the final
            URL after the request is unsupported or carries no trailing id.
    """
    if not islegelurl(url):
        print('请输入正确的url！具体支持的url请在命令行输入-h|--help参数查看帮助文档。')
        raise errorcode(1)

    # islegelurl() accepts an optional trailing slash, so the id pattern has to
    # tolerate it as well; otherwise ".../id/123/" would trigger a needless
    # HTTP request just to find an id that is already in the URL.
    trailing_id = re.compile(r'(\d+)/?$')
    found = trailing_id.search(url)
    if not found:
        # Short alias URLs carry no id; use the URL the request ended up at.
        url = requestsession.get(url).url
        found = trailing_id.search(url)
        if not islegelurl(url) or not found:
            print('无法自动跳转移动端url，请进入http://m.ac.qq.com，找到'
                  '该漫画地址。\n'
                  '地址应该像这样: '
                  'http://m.ac.qq.com/comic/comicinfo/id/xxxxx (xxxxx为整数)')
            raise errorcode(2)
    return found.group(1)
def getcontent(id):
    """Fetch a comic's title, introduction and chapter list ordered by 'seq'.

    Two requests are made through the shared session: one for the comic info
    and one for the chapter list. The 'cookie' and 'referer' headers set here
    remain on the shared session for later requests.

    NOTE(review): the endpoint paths and JSON keys are reproduced exactly as
    found; confirm their letter case against the live service.

    Args:
        id: the comic id, as returned by getid().

    Returns:
        A tuple (comicname, comicintrd, count, sortedcontentlist) where
        sortedcontentlist is a list of single-entry dicts
        {chapter_key: chapter_dict}, ordered by the chapter's 'seq' value.

    Raises:
        KeyError: if an expected key is missing from either response.
    """
    requestsession.headers.update({
        'cookie': 'ac_refer=http://pad.ac.qq.com',
        'referer': 'http://pad.ac.qq.com',
    })

    getcomicinfourl = 'http://pad.ac.qq.com/getdata/getcomicinfo?id={}'.format(id)
    comicinfo = json.loads(requestsession.get(getcomicinfourl).text)
    comicname = comicinfo['title']
    comicintrd = comicinfo['brief_intrd']

    getchapterlisturl = 'http://pad.ac.qq.com/getdata/getchapterlist?id={}'.format(id)
    contentjson = json.loads(requestsession.get(getchapterlisturl).text)
    count = contentjson['length']

    # Index the chapter entries by 'seq' in one pass instead of rescanning the
    # whole mapping for every sequence number. Non-dict values (such as the
    # 'length' entry) are skipped; the first entry seen for a seq wins.
    first_by_seq = {}
    for chapterkey, chapter in contentjson.items():
        if not isinstance(chapter, dict):
            continue
        seq = chapter.get('seq')
        if isinstance(seq, (int, float)):
            first_by_seq.setdefault(seq, {chapterkey: chapter})

    # Sequence numbers 0..count inclusive are considered; missing ones are
    # simply left out.
    sortedcontentlist = [first_by_seq[i] for i in range(count + 1)
                         if i in first_by_seq]
    return (comicname, comicintrd, count, sortedcontentlist)
def getimglist(contentjson, id):
    """Build the ordered list of image URLs for one chapter.

    NOTE(review): the endpoint path and JSON keys are reproduced exactly as
    found; confirm their letter case against the live service.

    Args:
        contentjson: a single-entry dict {chapter_key: chapter_dict}, i.e. one
            element of the list returned by getcontent(). Only its first key is
            used.
        id: the comic id (anything int() accepts).

    Returns:
        A list of image URL strings ordered by each picture's 'seq' value,
        covering seq 1..pcount; sequence numbers with no entry are left out.

    Raises:
        KeyError: if an expected key is missing from the response.
    """
    cid = list(contentjson)[0]
    getpichashurl = 'http://pad.ac.qq.com/view/mgetpichash?id={}&cid={}'.format(id, cid)
    picjson = json.loads(requestsession.get(getpichashurl).text)
    count = picjson['pcount']  # number of pictures in the chapter
    phash = picjson['phash']

    # Index pictures by 'seq' in one pass; the first entry seen for a seq wins.
    pages_by_seq = {}
    for page in phash.values():
        pages_by_seq.setdefault(page['seq'], page)

    comicid = int(id)
    imglist = []
    for seq in range(1, count + 1):
        page = pages_by_seq.get(seq)
        if page is None:
            continue
        chapterid = page['cid']
        picid = page['pid']
        # The uin parameter is the sum of the three ids, floored at 10001.
        uin = max(comicid + chapterid + picid, 10001)
        dir_path = '/mif800/{}/{}/{}/{}/'.format(
            comicid % 1000 // 100, comicid % 100, comicid, chapterid)
        name = '{}.mif2'.format(picid)
        imglist.append(
            'http://ac.tc.qq.com/store_file_download?buid=15017&uin={}'
            '&dir_path={}&name={}'.format(uin, dir_path, name))
    return imglist
def downloadimg(imgurllist, contentpath, one_folder=False):
    """Download every image of one chapter, skipping files that already exist.

    Images are saved as 001.jpg, 002.jpg, ... numbered by their position in
    *imgurllist*.

    Args:
        imgurllist: image URLs in page order.
        contentpath: the chapter directory, or -- when *one_folder* is true --
            a path prefix that the file name is appended to directly.
        one_folder: if true, build the file path as contentpath + file name
            instead of joining them as directory and file.

    Raises:
        errorcode: code 3 when the download is interrupted by
            KeyboardInterrupt or SystemExit.
        Exception: any other error from the request or from writing the file
            is re-raised after the partial file has been removed.
    """
    print('该集漫画共计{}张图片'.format(len(imgurllist)))
    for i, imgurl in enumerate(imgurllist, 1):
        print('\r正在下载第{}张图片...'.format(i), end='')
        filename = '{0:0>3}.jpg'.format(i)
        if one_folder:
            imgpath = contentpath + filename
        else:
            imgpath = os.path.join(contentpath, filename)

        # Skip the download when the target file already exists.
        if os.path.isfile(imgpath):
            continue

        try:
            downloadrequest = requestsession.get(imgurl, stream=True)
            with open(imgpath, 'wb') as f:
                for chunk in downloadrequest.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        except (KeyboardInterrupt, SystemExit):
            print('\n\n中断下载，删除未下载完的文件！')
            if os.path.isfile(imgpath):
                os.remove(imgpath)
            raise errorcode(3)
        except Exception:
            # A truncated file would be mistaken for a finished download on the
            # next run (see the existence check above), so remove it first.
            if os.path.isfile(imgpath):
                os.remove(imgpath)
            raise
    print('完毕!\n')
class listformaterror(ValueError):
    """Raised by parselist() when the chapter-list string is malformed."""


def parselist(lst):
    """Parse the -l/--list argument into a sorted list of chapter numbers.

    The string is a comma-separated list of items, each either a single number
    ("3") or dash-separated numbers ("5-7"). A dashed item expands to every
    number from its smallest to its largest value inclusive, so "7-5" and
    "5-7" are equivalent. Chapter 0 does not exist: it is dropped and a warning
    is printed for each item that contained it.

    Args:
        lst: the raw argument string, e.g. "1,3,5-7,11-111".

    Returns:
        The chapter numbers in ascending order without duplicates.

    Raises:
        listformaterror: (a ValueError subclass) if *lst* does not match
            ``^\\d+([,-]\\d+)*$``.
    """
    pattern = r'^\d+([,-]\d+)*$'
    if not re.match(pattern, lst):
        raise listformaterror(lst + ' 不匹配正则: ' + pattern)

    chapters = set()
    # Split on commas first, then split every item on dashes. A single number
    # is just a range whose two ends coincide.
    for sub in lst.split(','):
        bounds = [int(part) for part in sub.split('-')]
        low, high = min(bounds), max(bounds)
        if low == 0:
            print('警告: 参数中包括不存在的章节0，自动忽略')
            low = 1  # a lone "0" then yields the empty range(1, 1)
        chapters.update(range(low, high + 1))
    return sorted(chapters)
def main(url, path, lst=None, one_folder=False):
    """Download a comic from the command line.

    Args:
        url: home page of the comic to download.
        path: download root; the comic gets its own sub-directory inside it.
        lst: chapter selection string (the -l/--list argument); when empty or
            None every chapter is downloaded.
        one_folder: if true, save all images directly in the comic directory,
            prefixed with the chapter name, instead of one directory per
            chapter.

    Raises:
        SystemExit: with the errorcode's code when getid() or downloadimg()
            raises errorcode.
    """
    # Characters that are illegal in Windows file names: \ / : * ? " < > |
    forbiddenre = re.compile(r'[\\/":*?<>|]')
    try:
        os.makedirs(path, exist_ok=True)
        comicid = getid(url)
        comicname, comicintrd, count, contentlist = getcontent(comicid)
        contentnamelist = [item[k]['t'] for item in contentlist for k in item]

        print('漫画名: {}'.format(comicname))
        print('简介: {}'.format(comicintrd))
        print('章节数: {}'.format(count))
        print('章节列表:')
        try:
            print('\n'.join(contentnamelist))
        except UnicodeError:
            # Some consoles cannot encode every character in a chapter title.
            print('章节列表包含无法解析的特殊字符\n')

        # Replace characters that Windows forbids in file names with '_'.
        comicname = forbiddenre.sub('_', comicname)
        comicpath = os.path.join(path, comicname)
        os.makedirs(comicpath, exist_ok=True)
        print()

        if not lst:
            contentrange = range(1, len(contentlist) + 1)
        else:
            contentrange = parselist(lst)

        for i in contentrange:
            # contentrange is ascending, so every later number is too large too.
            if i > len(contentlist):
                print('警告: 章节总数 {} ,'
                      '参数中包含过大数值,'
                      '自动忽略'.format(len(contentlist)))
                break
            chaptername = forbiddenre.sub('_', contentnamelist[i - 1])
            contentpath = os.path.join(
                comicpath, '第{0:0>4}话-{1}'.format(i, chaptername))
            try:
                print('正在下载第{0:0>4}话: {1}'.format(i, chaptername))
            except UnicodeError:
                # Fall back to the number only. The original fallback kept the
                # '{1}' placeholder with a single argument and raised
                # IndexError from inside this handler.
                print('正在下载第{0:0>4}话'.format(i))
            if not one_folder:
                os.makedirs(contentpath, exist_ok=True)
            imglist = getimglist(contentlist[i - 1], comicid)
            downloadimg(imglist, contentpath, one_folder)
    except errorcode as e:
        # Use the error code as the process exit status. SystemExit is raised
        # directly because the exit() helper is only installed by the site
        # module.
        raise SystemExit(e.code)
if __name__ == '__main__':
    # Command-line entry point: parse the options, fall back to interactive
    # prompts when no URL was given, then start the download.
    defaultpath = os.path.join(os.path.expanduser('~'), 'tencent_comic')

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='*下载腾讯漫画，仅供学习交流，请勿用于非法用途*\n'
                    '空参运行进入交互式模式运行。')
    parser.add_argument(
        '-u', '--url',
        help='要下载的漫画的首页，可以下载以下类型的url: \n'
             'http://ac.qq.com/comic/comicinfo/id/511915\n'
             'http://m.ac.qq.com/comic/comicinfo/id/505430\n'
             'http://pad.ac.qq.com/comic/comicinfo/id/505430\n'
             'http://ac.qq.com/naruto')
    parser.add_argument(
        '-p', '--path',
        help='漫画下载路径。 默认: {}'.format(defaultpath),
        default=defaultpath)
    parser.add_argument(
        '-d', '--dir', action='store_true',
        help='将所有图片下载到一个目录(适合腾讯漫画等软件连看使用)')
    parser.add_argument(
        '-l', '--list',
        help=("要下载的漫画章节列表，不指定则下载所有章节。格式范例: \n"
              "n - 下载具体某一章节，如-l 1, 下载第1章\n"
              'n,n... - 下载某几个不连续的章节，如 "-l 1,3,5", 下载1,3,5章\n'
              'n-n... - 下载某一段连续的章节，如 "-l 10-50", 下载[10,50]章\n'
              '杂合型 - 结合上面所有的规则，如 "-l 1,3,5-7,11-111"'))
    args = parser.parse_args()

    url = args.url
    path = args.path
    lst = args.list
    one_folder = args.dir

    # Reject a malformed chapter list before any network access happens.
    if lst:
        legallistre = re.compile(r'^\d+([,-]\d+)*$')
        if not legallistre.match(lst):
            print('list参数不合法，请参考--help键入合法参数！')
            raise SystemExit(1)

    # Interactive mode: without -u, ask for both the URL and the target path.
    if not url:
        url = input('请输入漫画首页地址: ')
        path = input('请输入漫画保存路径(默认: {}): '.format(defaultpath))
        if not path:
            path = defaultpath

    main(url, path, lst, one_folder)

五、gui版源码(依赖上面的getcomic.py)

import os
import re
import sys

from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *

import getcomic
class tencentcomicdownloader(QWidget):
    """Main window of the GUI downloader.

    The user enters a comic home-page URL and clicks the analyse button; the
    chapter list is then fetched through the getcomic module and shown in a
    multi-selection list. Clicking the download button hands the selected
    chapters to a `downloader` thread.
    """

    # Characters that are illegal in Windows file names: \ / : * ? " < > |
    _forbiddenre = re.compile(r'[\\/":*?<>|]')

    def __init__(self, parent=None):
        """Create all widgets, wire their signals and lay them out in a grid."""
        super(tencentcomicdownloader, self).__init__(parent)

        # Comic URL input and the analyse button.
        namelabel = QLabel("漫画首页:")
        self.nameline = QLineEdit()
        self.analysisbutton = QPushButton("分析")
        self.analysisbutton.clicked.connect(self.anaysisurl)
        # Pressing Enter in the URL field acts like clicking the button.
        self.nameline.returnPressed.connect(self.analysisbutton.click)

        # Download directory input and the browse button.
        pathlinelabel = QLabel("下载路径:")
        self.pathline = QLineEdit()
        defaultpath = os.path.join(os.path.expanduser('~'), 'tencent_comic')
        self.pathline.setText(defaultpath)
        self.browsebutton = QPushButton("浏览")
        self.browsebutton.clicked.connect(self.getpath)

        # Comic title, single-directory option and introduction.
        comicnamelabel = QLabel("漫画名: ")
        self.comicnamelabel = QLabel("暂无")
        self.one_folder_checkbox = QCheckBox("单目录")
        comicintrolabel = QLabel("简介: ")
        self.comicintro = QLabel("暂无")
        self.comicintro.setWordWrap(True)

        # Chapter list; disabled until a comic has been analysed.
        chaptergroupbox = QGroupBox("章节列表:")
        self.chapterlistview = QListWidget(chaptergroupbox)
        self.chapterlistview.setSelectionMode(QAbstractItemView.ExtendedSelection)
        self.chapterlistview.setEnabled(False)
        groupboxlayout = QHBoxLayout(chaptergroupbox)
        groupboxlayout.addWidget(self.chapterlistview)

        # Download button and status line.
        self.downloadbutton = QPushButton("下载选中")
        self.statuslabel = QLabel("输入要下载的漫画的首页，然后点分析")
        self.statuslabel.setWordWrap(True)
        self.downloadbutton.setEnabled(False)
        self.downloadbutton.clicked.connect(self.download)

        mainlayout = QGridLayout()
        mainlayout.addWidget(namelabel, 0, 0)
        mainlayout.addWidget(self.nameline, 0, 1)
        mainlayout.addWidget(self.analysisbutton, 0, 2)
        mainlayout.addWidget(pathlinelabel, 1, 0)
        mainlayout.addWidget(self.pathline, 1, 1)
        mainlayout.addWidget(self.browsebutton, 1, 2)
        mainlayout.addWidget(comicnamelabel, 2, 0)
        # The title label occupies column 1 only: column 2 of this row holds
        # the checkbox, and a two-column span would put both in the same cell.
        mainlayout.addWidget(self.comicnamelabel, 2, 1)
        mainlayout.addWidget(self.one_folder_checkbox, 2, 2)
        mainlayout.addWidget(comicintrolabel, 3, 0)
        mainlayout.addWidget(self.comicintro, 3, 1, 1, 2)
        mainlayout.addWidget(chaptergroupbox, 4, 0, 1, 3)
        mainlayout.addWidget(self.downloadbutton, 5, 2)
        mainlayout.addWidget(self.statuslabel, 5, 0, 1, 2)

        self.setLayout(mainlayout)
        self.setWindowTitle("腾讯漫画下载")
        self.setGeometry(400, 300, 800, 500)

    def setstatus(self, status):
        """Show *status* in the status line (rich text is allowed)."""
        self.statuslabel.setText(status)

    def enablewidget(self, enable):
        """Enable or disable every input widget at once.

        When re-enabling, the download button caption is reset and the chapter
        list receives keyboard focus.
        """
        widgets_list = [
            self.downloadbutton,
            self.nameline,
            self.pathline,
            self.chapterlistview,
            self.analysisbutton,
            self.browsebutton,
            self.one_folder_checkbox,
        ]
        for widget in widgets_list:
            widget.setEnabled(enable)
        if enable:
            self.downloadbutton.setText('下载选中')
            self.chapterlistview.setFocus()

    def getpath(self):
        """Let the user pick the download directory with a folder dialog."""
        path = str(QFileDialog.getExistingDirectory(self, "选择下载目录"))
        if path:  # empty when the dialog was cancelled
            self.pathline.setText(path)

    def anaysisurl(self):
        """Fetch the comic behind the entered URL and fill in the chapter list.

        On success the comic id, title, introduction and chapter data are kept
        on the instance for download(); every chapter is listed and selected.
        On failure a red message is shown in the status line.
        """
        url = self.nameline.text()

        # Reset everything that depends on the previously analysed comic.
        self.downloadbutton.setEnabled(False)
        self.comicnamelabel.setText("暂无")
        self.comicintro.setText("暂无")
        self.chapterlistview.clear()
        self.chapterlistview.setEnabled(False)

        try:
            if not getcomic.islegelurl(url):
                self.statuslabel.setText('<font color="red">错误的url格式！请输入正确的漫画首页地址！</font>')
                return

            self.id = getcomic.getid(url)
            (self.comicname, self.comicintrd,
             self.count, self.contentlist) = getcomic.getcontent(self.id)
            self.contentnamelist = [item[k]['t']
                                    for item in self.contentlist for k in item]

            self.comicnamelabel.setText(self.comicname)
            self.comicintro.setText(self.comicintrd)
            self.chapterlistview.setEnabled(True)
            self.downloadbutton.setEnabled(True)
            self.chapterlistview.setFocus()
            self.statuslabel.setText('选择要下载的章节后点击右侧按钮')
            # List every chapter and select all of them by default.
            for i, name in enumerate(self.contentnamelist):
                self.chapterlistview.addItem('第{0:0>4}话-{1}'.format(i + 1, name))
                self.chapterlistview.item(i).setSelected(True)
        except getcomic.errorcode as e:
            if e.code == 2:
                self.statuslabel.setText('<font color="red">无法跳转为移动端url,请进入http://m.ac.qq.com找到该漫画地址</font>')
        except KeyError:
            # An expected key was missing from the server's response.
            self.statuslabel.setText('<font color="red">不存在的地址</font>')
        except Exception as e:
            # Anything else (e.g. a network failure) is reported in the status
            # line instead of escaping from the slot.
            self.statuslabel.setText('<font color="red">{}</font>'.format(e))

    def download(self):
        """Start downloading the selected chapters in a background thread."""
        self.downloadbutton.setText("下载中...")
        one_folder = self.one_folder_checkbox.isChecked()
        self.enablewidget(False)

        # Rows are sorted so chapters are downloaded in ascending order no
        # matter in which order they were selected.
        selectedchapterlist = sorted(
            index.row() for index in self.chapterlistview.selectedIndexes())

        path = self.pathline.text()
        # Replace characters that Windows forbids in file names with '_'.
        comicname = self._forbiddenre.sub('_', self.comicname)
        comicpath = os.path.join(path, comicname)
        if not os.path.isdir(comicpath):
            os.makedirs(comicpath)

        # Keep a reference on self so the thread object outlives this call.
        self.downloadthread = downloader(
            selectedchapterlist, comicpath, self.contentlist,
            self.contentnamelist, self.id, one_folder)
        self.downloadthread.output.connect(self.setstatus)
        self.downloadthread.finished.connect(lambda: self.enablewidget(True))
        self.downloadthread.start()
class downloader(QThread):
    """Background thread that downloads the selected chapters one by one.

    Signals:
        output(str): progress or error text for the status line.
        finished(): emitted once when run() is about to return, whether it
            succeeded or failed.
    """

    output = pyqtSignal(str)
    # NOTE(review): this redeclares a signal name that QThread already
    # provides; kept because callers connect to it -- confirm it is intended.
    finished = pyqtSignal()

    def __init__(self, selectedchapterlist, comicpath, contentlist,
                 contentnamelist, id, one_folder=False, parent=None):
        """Store the download job.

        Args:
            selectedchapterlist: zero-based indexes into *contentlist*.
            comicpath: directory of the comic; chapters are saved beneath it.
            contentlist: chapter entries as returned by getcomic.getcontent().
            contentnamelist: chapter titles, parallel to *contentlist*.
            id: the comic id.
            one_folder: passed through to getcomic.downloadimg().
            parent: optional Qt parent object.
        """
        super(downloader, self).__init__(parent)
        self.selectedchapterlist = selectedchapterlist
        self.comicpath = comicpath
        self.contentlist = contentlist
        self.contentnamelist = contentnamelist
        self.id = id
        self.one_folder = one_folder

    def run(self):
        """Download every selected chapter, reporting progress via `output`."""
        # Characters that are illegal in Windows file names: \ / : * ? " < > |
        forbiddenre = re.compile(r'[\\/":*?<>|]')
        try:
            for i in self.selectedchapterlist:
                outputstring = '正在下载第{0:0>4}话: {1}...'.format(
                    i + 1, self.contentnamelist[i])
                print(outputstring)
                self.output.emit(outputstring)
                # Sanitise into a local: the name list is shared with the GUI
                # and must not be modified from this thread.
                chaptername = forbiddenre.sub('_', self.contentnamelist[i])
                contentpath = os.path.join(
                    self.comicpath, '第{0:0>4}话-{1}'.format(i + 1, chaptername))
                if not self.one_folder:
                    if not os.path.isdir(contentpath):
                        os.mkdir(contentpath)
                imglist = getcomic.getimglist(self.contentlist[i], self.id)
                getcomic.downloadimg(imglist, contentpath, self.one_folder)
            self.output.emit('完毕!')
        except Exception as e:
            self.output.emit('<font color="red">{}</font>\n'
                             '遇到异常!请尝试重新点击下载按钮重试'.format(e))
            # Log the traceback but do not re-raise: the failure has been
            # reported through `output`, and an exception escaping run() can
            # terminate the whole application on recent PyQt5 versions.
            import traceback
            traceback.print_exc()
        finally:
            self.finished.emit()
if __name__ == '__main__':
    # GUI entry point: show the main window and run the Qt event loop,
    # passing the loop's return code on as the process exit status.
    app = QApplication(sys.argv)
    window = tencentcomicdownloader()
    window.show()
    sys.exit(app.exec_())

到此，这篇关于“再也不用花钱买漫画!python下载某漫画的脚本及源码”的文章就介绍到这了。更多相关python下载漫画的内容，请搜索www.887551.com以前的文章，或继续浏览下面的相关文章。希望大家以后多多支持www.887551.com！

黄山市民网：https://www.huangshanshimin.com/