python制作微博图片爬取工具

有小半个月没有发博客了，因为一直在研究python的gui，买了一本书学习了一些基础，用我所学做了我的第一款gui——微博图片爬取工具。本软件源代码已经放在了博客中，另外软件已经打包好上传到网盘中以供下载学习。

一．准备工作

本次要用到以下依赖库：re、json、os、random、tkinter、threading、requests、PIL（Pillow）。其中后两个是第三方库，需要先安装（pip install requests pillow）后才能使用。

二．预览

1.启动

2.运行中

3.结果

这里只拿一张图片作为展示。

三．设计流程

设计流程分为总体设计和详细设计，这里我会使用Visio画出几个流程图，用以展示我的思路。其中详细设计部分，我列举了两个函数实现的具体流程。

1.总体设计

此图为整个系统的整体流程也是本gui软件的使用过程。

2.详细设计

在此列举两个函数：一个是搜索按钮触发的wb_search函数，另一个是开始爬取按钮触发的wb_pics_parse函数。

2.1wb_search函数

2.2wb_pics_parse函数

四．源代码

import json
import os
import random
import re
import threading
from tkinter import *
from tkinter import messagebox
from tkinter import ttk

import requests
from PIL import Image, ImageTk
"""
1.07使用check button 实现下载完打开文件夹操作，注册了enter、esc热键，优化了一些体验
1.08 1.更新了关键字、磁盘、用户判断逻辑
2.将之前的线程池改为多线程来执行下载操作
1.13说明：如果在下载过程变慢，可能是软件正在解析图片地址或者就是您的网络不行
"""
class weibo_pics_spider(object):
    """Walk the paged timeline API of one Weibo user and download its pictures.

    The class talks to the GUI through module-level names defined elsewhere
    in this file: the ``t1`` log widget, the ``window`` root, the ``r1_var``
    "open folder when done" flag, and the ``disk`` / ``user_name_selected``
    globals that decide where files are written.
    """

    def __init__(self, start_url):
        """Remember the container URL; ``&page=N`` is appended per request."""
        self.start_url = start_url

    @staticmethod
    def _iter_page_pic_urls(cards):
        """Yield the ``large`` picture URL of every picture in one page of cards.

        Cards without an ``mblog`` entry, and posts without ``pics``, are
        skipped. ``cards`` may be ``None`` or empty.
        """
        for card in cards or ():
            mblog = card.get('mblog')
            if mblog is None:
                continue
            pics = mblog.get('pics')
            if pics is None:
                continue
            for pic in pics:
                yield pic['large']['url']

    # Resolve the picture addresses.
    def get_pics_url(self):
        """Generate picture URLs page by page until the API stops answering ok.

        Sets the module-level ``a_flag`` to ``True`` on start; the crawl
        continues while the response's ``ok`` field is 1 and ``a_flag`` is
        still truthy. When it stops, the page number is written to ``t1`` and,
        if ``r1_var`` is 1, the download folder is opened.
        """
        global a_flag
        a_flag = True
        page = 1
        while True:
            url = self.start_url + '&page={}'.format(page)
            headers = {'user-agent': get_ua()}
            r = requests.get(url, headers=headers)
            _json = json.loads(r.text)
            # API "ok" flag plus the manual stop flag.
            if _json.get('ok') == 1 and a_flag:
                # A response may lack "data"/"cards"; treat that as an empty page.
                cards = (_json.get('data') or {}).get('cards')
                yield from self._iter_page_pic_urls(cards)
            else:
                t1.insert(END, f'***在第{page}页终止***\n')
                t1.see(END)
                t1.update()
                if r1_var.get() == 1:
                    big_dir = disk + ':/weibo_pics'
                    os.startfile(big_dir)
                break
            page += 1

    # Download one picture.
    def download_pics(self, url, filename):
        """Fetch ``url`` and store it as ``<disk>:/weibo_pics/<user>/<filename>``.

        The target directory is created on demand. After the file is written,
        its name is appended to the ``t1`` log and the window is refreshed.
        """
        headers = {'user-agent': get_ua()}
        r = requests.get(url, headers=headers)
        big_dir = disk + ':/weibo_pics'
        aim_path = big_dir + '/' + user_name_selected
        # exist_ok replaces the former blanket "except: pass" around makedirs.
        os.makedirs(aim_path, exist_ok=True)
        with open(os.path.join(aim_path, filename), 'wb') as f:
            f.write(r.content)
        # Log the file first, then scroll, so the newest line is the visible one.
        t1.insert(END, f'{filename}\n')
        t1.see(END)
        # Refresh after every picture so the interface does not appear frozen.
        window.update()
def get_ua():
    """Return a randomly assembled Chrome-style User-Agent string.

    The Chrome major version is drawn from 55-62, the build number from
    0-3200 and the patch number from 0-140; the platform token is picked
    from a fixed list of four.
    """
    first_num = random.randint(55, 62)
    third_num = random.randint(0, 3200)
    fourth_num = random.randint(0, 140)
    # NOTE(review): every token here is lower-case, whereas real browser UA
    # strings are mixed-case -- confirm whether the casing was lost in transit.
    os_type = [
        '(windows nt 6.1; wow64)',
        '(windows nt 10.0; wow64)',
        '(x11; linux x86_64)',
        '(macintosh; intel mac os x 10_12_6)',
    ]
    chrome_version = 'chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)
    ua = ' '.join([
        'mozilla/5.0',
        random.choice(os_type),
        'applewebkit/537.36',
        '(khtml, like gecko)',
        chrome_version,
        'safari/537.36',
    ])
    return ua
def wb_search():
    """Look up Weibo users for the text in ``e1`` and list them in ``listb1``.

    A keyword made of exactly ten digits is treated as a user id and resolved
    to its screen name; anything else is sent to the user-search endpoint.
    The module-level ``user_id_list`` is rebuilt so that its indices line up
    with the rows of ``listb1``. The outcome is reported through the ``l3``
    status label and, for problems, a message box.
    """
    global user_id_list
    # Clear the list box first so that only the new results are shown.
    listb1.delete(0, END)
    url1 = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3d3%26q%3d{}%26t%3d0'
    headers = {'user-agent': get_ua()}
    key_word = e1.get()
    user_id_list = list()

    # No keyword entered.
    if len(key_word) == 0:
        messagebox.showinfo(title='info', message='请输入关键字！')
        l3.place(x=110, y=42)
        l3_var.set('请输入关键字！')
        l3['background'] = 'red'
        return

    # The user typed a user id: fetch the matching screen_name.
    # fullmatch, so a keyword that merely starts with ten digits is not
    # mistaken for an id.
    if re.fullmatch(r'\d{10}', key_word):
        user_id_list.append(key_word)
        url2 = f'https://m.weibo.cn/api/container/getIndex?uid={key_word}&containerid=100505{key_word}'
        r1 = requests.get(url2, headers=headers)
        _data = json.loads(r1.text)
        screen_name = _data['data']['userInfo'].get('screen_name')
        l3.place(x=120, y=42)
        l3_var.set('搜索成功')
        l3['background'] = 'green'
        listb1.insert(END, screen_name)
        return

    # Otherwise search users by keyword and show them in the list box.
    aim_url = url1.format(key_word)
    r = requests.get(aim_url, headers=headers)
    _json = json.loads(r.text)
    try:
        # A missing second card (IndexError) means no user was found; a
        # missing key or an absent card_group is handled the same way.
        users = _json['data']['cards'][1].get('card_group')
        relevant_num = len(users)
    except (IndexError, KeyError, TypeError):
        messagebox.showinfo(title='提示', message='没有检索到相关用户，请更换关键字或使用用户id搜索！')
        l3.place(x=85, y=42)
        l3_var.set('请更换关键字或用户id搜索！')
        l3['background'] = 'yellow'
        # Drop the keyword that produced no result. The earlier code passed
        # the *result* of e1_clear() to bind(), which cleared the entry right
        # away and registered no handler; call it directly instead.
        e1_clear()
        return

    l3.place(x=105, y=42)
    l3_var.set(f'搜索到了 {relevant_num} 个用户')
    l3['background'] = 'green'
    for user_ in users:
        user_info = user_.get('user')
        user_name = user_info.get('screen_name')
        user_id = user_info.get('id')
        # An idea from v1.02, not used: keep "screen_name;uid" strings in one
        # list and split them on lookup. Instead the uid is stored at the same
        # index as the list-box row.
        user_id_list.append(user_id)
        listb1.insert(END, user_name)
def wb_pics_parse():
    """Validate the form, then crawl and download the selected user's pictures.

    Checks, in order: a keyword was entered, a one-letter drive was chosen,
    that drive exists, and a user is selected in ``listb1``. Each failure
    shows a warning box and updates the ``l3`` status label. On success every
    picture URL produced by the spider is handed to ``thread_it`` for
    download. Sets the module-level ``user_name_selected`` and ``disk``.
    """
    global user_name_selected, disk
    key_word = e1.get()
    select_path = c1.get()

    # 1. A keyword must have been entered.
    if len(key_word) == 0:
        messagebox.showwarning(title='警告', message='请输入关键字！')
        l3.place(x=110, y=42)
        l3_var.set('请输入关键字！')
        l3['background'] = 'red'
        return

    # 2. A drive (single letter) must have been chosen.
    if len(select_path) != 1:
        messagebox.showwarning(title='警告', message='您未选择磁盘!')
        l3.place(x=85, y=42)
        l3_var.set('请检查是否选择了磁盘！')
        l3['background'] = 'red'
        return

    # 3. The chosen drive must exist. Test the drive root ("X:/"), the same
    #    form print_path checks; the bare letter would be looked up as a
    #    relative file name.
    if not os.path.exists(select_path + ':/'):
        messagebox.showwarning(title='警告', message='请检查路径！')
        l3.place(x=80, y=42)
        l3_var.set('请检查路径！')
        l3['background'] = 'red'
        return
    # Keep the global in step with the combobox; download_pics reads it.
    disk = select_path

    # 4. A user must be selected in the list box. With nothing selected the
    #    lookup raises TclError, which is the only thing this try guards.
    try:
        user_name_selected = listb1.get(listb1.curselection())
        user_name_index = listb1.curselection()[0]
    except TclError:
        messagebox.showwarning(title='警告', message='请选择一个用户！')
        l3.place(x=105, y=42)
        l3_var.set('请选择一个用户！')
        l3['background'] = 'red'
        return

    user_id = user_id_list[user_name_index]
    container_id = '107603' + str(user_id)
    start_url = f'https://m.weibo.cn/api/container/getIndex?containerid={container_id}'
    spider = weibo_pics_spider(start_url)
    # Make the log Text widget writable.
    t1.config(state='normal')
    l3.place(x=120, y=42)
    l3_var.set('正在运行......')
    l3['background'] = 'green'
    for pic_url in spider.get_pics_url():
        filename = pic_url.split('/')[-1]
        # Drop the first 10 characters of the remote file name.
        filename = filename[10:]
        thread_it(spider.download_pics, pic_url, filename)
def open_disk():
    """Open ``<drive>:/weibo_pics`` in the file manager, creating it if needed.

    The drive letter is read from the ``c1`` combobox. A missing selection or
    a failing drive is reported through a warning box and the ``l3`` label.
    """
    drive = c1.get()
    if len(drive) != 1:
        messagebox.showwarning(title='警告', message='您未选中磁盘！')
        l3.place(x=115, y=42)
        l3_var.set('您未选中磁盘！')
        l3['background'] = 'red'
        return

    big_dir = drive + ':/weibo_pics'
    try:
        if not os.path.exists(big_dir):
            os.mkdir(big_dir)
        os.startfile(big_dir)
    except OSError:
        # mkdir/startfile fail with OSError when the drive is not there.
        messagebox.showwarning(title='警告', message='选中的磁盘不存在！')
        l3.place(x=110, y=42)
        l3_var.set('选中的磁盘不存在！')
        l3['background'] = 'red'
def window_quit():
    """Ask for confirmation and, if given, destroy the window and stop the loop."""
    if messagebox.askyesno(title='提示', message='是否要退出？'):
        window.destroy()
        window.quit()
def e1_clear():
    """Remove all text from the keyword entry ``e1``."""
    e1.delete(0, END)
def print_path(event):
    """Handle a drive selection in ``c1``: store it and tell the user the target.

    Writes the chosen letter to the module-level ``disk``. If the drive root
    exists, an info box names the download directory; otherwise an error is
    shown and the ``l3`` status label turns red.
    """
    global disk
    disk = c1.get()
    # The existence test needs the full root path, not the bare letter.
    disk_path = disk + ':/'

    if len(disk) != 1:
        messagebox.showwarning(title='警告', message='请先选定磁盘！')
        l3.place(x=120, y=42)
        l3_var.set('请先选定磁盘！')
        l3['background'] = 'red'
        return

    if os.path.exists(disk_path):
        messagebox.showinfo(title='提示', message=f'文件将存储到：{disk}:/weibo_pics目录下')
    else:
        messagebox.showerror(title='错误', message='选定磁盘不存在!')
        l3.place(x=100, y=42)
        l3_var.set('选中的磁盘不存在！')
        l3['background'] = 'red'
def switch():
    """Toggle ``r1_var`` between 0 and 1 (the "open folder when done" flag)."""
    r1_var.set(1 if r1_var.get() == 0 else 0)
def escape(event=None):
    """Key-binding adapter: run the quit confirmation; ``event`` is unused."""
    window_quit()
def enter(event=None):
    """Key-binding adapter: start a user search; ``event`` is unused."""
    wb_search()
# Key to keeping the program responsive: long-running work must not run on
# the same thread as the UI.
def thread_it(func, *args):
    """Run ``func(*args)`` on a new daemon thread and return that thread.

    The thread is a daemon so it does not keep the process alive after the
    main window is closed. It is deliberately not joined here: joining would
    block the caller, i.e. freeze the interface.
    """
    worker = threading.Thread(target=func, args=args, daemon=True)
    worker.start()
    return worker
# ---------------------------------------------------------------------------
# Build the main window. The widgets below are module-level on purpose: the
# handler functions in this file reach them by name (window, e1, c1, l3,
# l3_var, listb1, t1, r1_var).
# ---------------------------------------------------------------------------
window = Tk()
width = 310
height = 395
screenwidth = window.winfo_screenwidth()    # width of the display area
screenheight = window.winfo_screenheight()  # height of the display area
# Centre the fixed-size window on the screen.
left = (screenwidth - width) / 2
top = (screenheight - height) / 2
window.geometry("%dx%d+%d+%d" % (width, height, left, top))
window.resizable(0, 0)
window.title('微博图片采集工具-v1.08')

# Window icon.
ico_path = r'./rely/icon.ico'
window.iconbitmap(ico_path)

# Banner image shown in a label at the top.
photo = Image.open("./rely/w_b.png")
photo = photo.resize((150, 40))
# Keep a module-level reference to the PhotoImage while the label uses it.
img0 = ImageTk.PhotoImage(photo)
l1 = ttk.Label(window, image=img0, justify='center')
l1.pack()

# Status label; the handlers change its text, colour and x position.
l3_var = StringVar()
l3 = ttk.Label(window, background='yellow', textvariable=l3_var)
l3.place(x=120, y=42)
l3_var.set('还没搜索')

# Keyword / user-id input.
l1 = ttk.Label(window, text='关键字或\n用户id：')
l1.place(x=13, y=60)
e1 = ttk.Entry(window, justify='center')
e1.place(x=80, y=65)

# Drive chooser. Nothing is preselected here; choosing an entry fires
# print_path.
l4 = ttk.Label(window, text='磁盘:')
l4.place(x=13, y=100)
disk_list = ['c', 'd', 'e', 'f', 'g', 'h', 'i']
c1 = ttk.Combobox(window, justify='center', state='readonly', width=17, values=disk_list)
c1.bind('<<ComboboxSelected>>', print_path)
c1.place(x=80, y=100)

# "Open the folder when finished" option. r1_var holds the real state and is
# toggled by switch() on every click; it starts at 1.
r1_var = IntVar()
r1_var.set(1)
check1 = Checkbutton(window, text='下载完\n打开文件夹', command=switch)
# The check box is not linked to r1_var, so tick it once here to make the
# initial display agree with the default of 1.
check1.select()
check1.place(x=223, y=90)

b1 = ttk.Button(window, text='搜索', command=lambda: thread_it(wb_search), width=7)
b1.place(x=230, y=63)

# Search results.
l5 = ttk.Label(window, text='用户列表:')
l5.place(x=13, y=150)
lb1_var = StringVar()
listb1 = Listbox(window, justify='center', listvariable=lb1_var, width=20, height=4)
listb1.place(x=80, y=135)

b2 = ttk.Button(window, text='开始爬取', command=lambda: thread_it(wb_pics_parse), width=7)
b2.place(x=230, y=160)

# Download log; read-only until wb_pics_parse switches it to 'normal'.
l6 = ttk.Label(window, text='状态：')
l6.place(x=13, y=280)
t1 = Text(window, width=23, font=('times new roman', 10), state='disabled')
t1.place(x=80, y=230, height=140)

b3 = ttk.Button(window, text=' 打开\n文件夹', width=7, command=open_disk)
b3.place(x=230, y=230)
b3 = ttk.Button(window, text='退出', width=7, command=window_quit)
b3.place(x=230, y=315)

# Disclaimer at the bottom.
f1 = ttk.LabelFrame(window)
f1.place(x=65, y=350)
l6 = ttk.Label(f1, text='敬告：本软件仅供学习交流使用！', foreground='red')
l6.pack(anchor="w", fill=X)

# Esc asks to quit.
window.bind('<Escape>', escape)
# Return inside the entry starts a search.
e1.bind('<Return>', enter)
# Closing the window via the title bar goes through the same confirmation.
window.protocol('WM_DELETE_WINDOW', window_quit)
window.mainloop()

五．总结说明

本软件仅供学习交流使用！图源水印，在此仅作举例！
由于这是第一次做gui，因此遇到了一些问题，在此列举一下：
1.窗口布局问题（gui基础）
2.主窗口执行一个比较耗时操作导致卡死、崩溃（线程问题）。
3.主窗口关闭后，后台线程还在运行（线程问题）。

以上问题已经全部解决，软件切实可用。

另外,本软件有四大亮点：

1.使用线程下载图片
2.智能标签提醒
3.输入关键字直接敲回车能够完成搜索
4.esc快速退出软件
软件打包好了放在了蓝奏云https://wws.lanzous.com/ipspzkchj5i

以上就是python制作微博图片爬取工具的详细内容，更多关于python 微博图片爬取的资料请关注www.887551.com其它相关文章！

黄山市民网：https://www.huangshanshimin.com/