安装、导入第三方模块

安装:

pip3 install Pillow

Image 模块:
Image 模块是 Python PIL(Pillow)图像处理库中最常用的模块,主要用于图像的基本处理,提供 open、save、convert、show 等常用功能。
crop 方法:
从图像中裁剪出一个矩形区域并返回新的图像,参数为 (left, upper, right, lower) 四元组。如果用户想在裁剪结果上粘贴或修改数据,可以放心操作,原始图像不会受到影响。

导入:

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

1.获得验证码图片

分析

进入12306的登录页面,按F12进入开发者模式,点击刷新图片验证码,找到获取图片的接口。
找到验证码图片信息。
发送请求,获得响应,解析响应信息,提取返回的图片信息,进行Base64解码,保存到本地。

保存验证码图片

发送请求,获得响应,解析响应信息,提取返回的图片信息,进行Base64解码,保存到本地。

# Captcha endpoint; the reply is unwrapped below as JSONP: callback({...});
captcha_url = "https://kyfw.12306.cn/passport/captcha/captcha-image64?login_site=E&module=login&rand=sjrand&1607391410127&callback=jQuery19109754414031805079_1607391094834&_=1607391094838"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
}

# NOTE(review): `session` (presumably a requests.Session), `json` and `base64`
# are not created/imported in this snippet - they must exist before it runs.
response = session.get(url=captcha_url, headers=headers).text
# Keep only the JSON document between the outermost parentheses; slicing by
# the first "(" and the last ")" still works if the JSON itself contains
# parentheses or the text ");".
respJson = response[response.index("(") + 1:response.rindex(")")]
respD = json.loads(respJson)
print(respD)
image_bs64 = respD['image']
print('解密前:', image_bs64)
# Decode exactly once: the result is already the raw image data. Running
# b64decode over these bytes a second time either raises binascii.Error or
# yields meaningless output.
captcha_response = base64.b64decode(image_bs64)
print('解密后:', captcha_response)
# Finally store the captcha picture on disk
with open('captcha.jpg', 'wb') as f:
    f.write(captcha_response)

切割图像

由于下载的图片都是有固定的位置,所以直接控制像素进行切割就行了。

# Crop a single captcha tile; the tiles sit at fixed pixel positions, so
# plain arithmetic on the grid coordinates is enough.
def cut_img(im, x, y):
    """Return the tile at column ``x`` / row ``y`` cropped out of ``im``.

    Tiles are 67 px squares laid out on a 72 px pitch (67 px tile + 5 px gap);
    the grid starts 5 px from the left edge and 41 px from the top edge.

    Args:
        im: object exposing ``crop(box)`` (e.g. a PIL image).
        x: column index, asserted to lie in 0..3.
        y: row index, asserted to lie in 0..2.

    Returns:
        Whatever ``im.crop((left, top, right, bottom))`` returns.
    """
    assert x >= 0 and x <= 3
    assert y >= 0 and y <= 2
    tile = 67
    pitch = tile + 5
    x0 = 5 + pitch * x
    y0 = 41 + pitch * y
    box = (x0, y0, x0 + tile, y0 + tile)
    return im.crop(box)

if __name__ == '__main__':
    import os

    # Saving into ./images fails when the folder is missing, so create it
    # up front (no-op if it already exists).
    os.makedirs('./images', exist_ok=True)
    # The context manager closes the captcha file once all tiles are written.
    with Image.open("captcha.jpg") as im:
        # rows (y axis)
        for y in range(2):
            # columns (x axis)
            for x in range(4):
                im2 = cut_img(im, x, y)
                im2.save('./images/%s_%s.png' % (y, x))

2.分析验证码图片

上一步我们获得了验证码的图片,现在需要将验证码进行切分,将其中的子图片信息分割出来,然后进行识别。

2.0获取access_token

注意:图片识别接口的access_token和文字识别接口的access_token不一样,因为两者对应的API Key和Secret Key不一样。

2.1分析验证码文字信息

这里调用百度智能云平台的文字识别接口:

# Fetch the access token used by the text-recognition API
def get_TextToken():
    """Request an OAuth token with the text-recognition app's credentials.

    Returns:
        The ``access_token`` field of the JSON reply.

    Raises:
        KeyError: if the reply contains no ``access_token`` field.
    """
    credentials = {
        "grant_type": "client_credentials",
        "client_id": "2***********s",
        "client_secret": "***********a",
    }
    reply = requests.get(
        "https://aip.baidubce.com/oauth/2.0/token",
        params=credentials,
    )
    return reply.json()["access_token"]

# Recognise the text contained in an image
def get_TextResult(access_token, image):
    """Send an image file to the ``webimage`` OCR endpoint.

    Args:
        access_token: token of the text-recognition application.
        image: path of the image file to upload.

    Returns:
        The ``words`` value of the first ``words_result`` entry, or ``None``
        when the reply is empty or no text was recognised.

    Raises:
        KeyError: if a non-empty reply has no ``words_result`` field.
    """
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage"
    # The picture is sent as Base64 text in a form-encoded body
    with open(image, 'rb') as f:
        encoded = base64.b64encode(f.read())
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    data = {
        "access_token": access_token,
        "image": encoded
    }
    res = requests.post(url, headers=headers, data=data).json()
    # Sample reply: {'log_id': 3563987230318319048, 'words_result_num': 1, 'words_result': [{'words': '龙舟一水'}]}
    if not res:
        return None
    words = res['words_result']
    # An empty list means nothing was recognised; indexing [0] used to raise
    # IndexError here.
    if not words:
        return None
    return words[0]['words']

2.2分析子图信息

""" 获取图片识别access_token client_id 为官网获取的AK, client_secret 为官网获取的SK """
def get_token():
    """Request an OAuth token for the image-recognition application.

    ``client_id`` is the application's API Key (AK) and ``client_secret`` its
    Secret Key (SK).

    Returns:
        The ``access_token`` field of the JSON reply.

    Raises:
        KeyError: if the reply contains no ``access_token`` field.
    """
    credentials = {
        "grant_type": "client_credentials",
        "client_id": "*****************T0n",
        "client_secret": "c*****************r",
    }
    reply = requests.get(
        "https://aip.baidubce.com/oauth/2.0/token",
        params=credentials,
    )
    return reply.json()["access_token"]
""" 通过权限验证码和图片进行识别物品 """
def get_result(access_token, image):
    """Upload an image file to the ``advanced_general`` recognition endpoint.

    Args:
        access_token: token of the image-recognition application.
        image: path of the image file to upload.

    Returns:
        The ``result`` entry of the JSON reply, or ``None`` when the reply is
        empty.
    """
    # The picture travels as Base64 text inside a form-encoded body
    with open(image, 'rb') as fh:
        encoded = base64.b64encode(fh.read())

    reply = requests.post(
        "https://aip.baidubce.com/rest/2.0/image-classify/v2/advanced_general",
        headers={'Content-Type': 'application/x-www-form-urlencoded'},
        data={"access_token": access_token, "image": encoded},
    ).json()
    if not reply:
        return None
    # Sample value: [{'score': 0.742749, 'root': '商品-电脑办公', 'keyword': '电脑'}, {'score': 0.584222, 'root': '商品-原材料', 'keyword': '板材'}]
    return reply['result']

# Pick the keyword of the best-scoring recognition candidate
def get_keywords(result):
    """Return the ``keyword`` of the entry with the highest ``score``.

    Args:
        result: iterable of dicts carrying ``score`` and ``keyword`` keys;
            ``None`` or an empty iterable is accepted.

    Returns:
        The winning keyword, or ``None`` when there are no candidates.
    """
    candidates = list(result) if result is not None else []
    if not candidates:
        return None
    # Scanning from the end keeps the earlier tie-break (the last of several
    # equally scored entries wins) without sorting the whole list.
    best = max(reversed(candidates), key=lambda entry: entry['score'])
    return best['keyword']

完整代码

Baidu.main

# author: LiuShihao
# data: 2020/12/8 12:45 下午
# youknow: 各位老铁,我的这套代码曾经有人出价三个亿我没有卖,如今拿出来和大家分享,不求别的,只求大家免费的小红心帮忙点一点,这里谢过了。
# desc:
import base64
import requests
import time
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True


""" 获取图片识别access_token client_id 为官网获取的AK, client_secret 为官网获取的SK """
def get_token():
    """Request an OAuth token for the image-recognition application.

    ``client_id`` is the application's API Key (AK) and ``client_secret`` its
    Secret Key (SK).

    Returns:
        The ``access_token`` field of the JSON reply.

    Raises:
        KeyError: if the reply contains no ``access_token`` field.
    """
    credentials = {
        "grant_type": "client_credentials",
        "client_id": "Gra*************X7T0n",
        "client_secret": "c*************FiVr",
    }
    reply = requests.get(
        "https://aip.baidubce.com/oauth/2.0/token",
        params=credentials,
    )
    return reply.json()["access_token"]

def get_TextToken():
    """Request an OAuth token with the text-recognition app's credentials.

    Returns:
        The ``access_token`` field of the JSON reply.

    Raises:
        KeyError: if the reply contains no ``access_token`` field.
    """
    credentials = {
        "grant_type": "client_credentials",
        "client_id": "*************UCs",
        "client_secret": "U*************mXBa",
    }
    reply = requests.get(
        "https://aip.baidubce.com/oauth/2.0/token",
        params=credentials,
    )
    return reply.json()["access_token"]

""" 通过权限验证码和图片进行识别物品 """
def get_result(access_token, image):
    """Upload an image file to the ``advanced_general`` recognition endpoint.

    Args:
        access_token: token of the image-recognition application.
        image: path of the image file to upload.

    Returns:
        The ``result`` entry of the JSON reply, or ``None`` when the reply is
        empty.
    """
    # The picture travels as Base64 text inside a form-encoded body
    with open(image, 'rb') as fh:
        encoded = base64.b64encode(fh.read())

    reply = requests.post(
        "https://aip.baidubce.com/rest/2.0/image-classify/v2/advanced_general",
        headers={'Content-Type': 'application/x-www-form-urlencoded'},
        data={"access_token": access_token, "image": encoded},
    ).json()
    if not reply:
        return None
    # Sample value: [{'score': 0.742749, 'root': '商品-电脑办公', 'keyword': '电脑'}, {'score': 0.584222, 'root': '商品-原材料', 'keyword': '板材'}]
    return reply['result']

# Recognise the text contained in an image
def get_TextResult(access_token, image):
    """Send an image file to the ``webimage`` OCR endpoint.

    Args:
        access_token: token of the text-recognition application.
        image: path of the image file to upload.

    Returns:
        The ``words`` value of the first ``words_result`` entry, or ``None``
        when the reply is empty or no text was recognised.

    Raises:
        KeyError: if a non-empty reply has no ``words_result`` field.
    """
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage"
    # The picture is sent as Base64 text in a form-encoded body
    with open(image, 'rb') as f:
        encoded = base64.b64encode(f.read())
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    data = {
        "access_token": access_token,
        "image": encoded
    }
    res = requests.post(url, headers=headers, data=data).json()
    # Sample reply: {'log_id': 3563987230318319048, 'words_result_num': 1, 'words_result': [{'words': '龙舟一水'}]}
    if not res:
        return None
    words = res['words_result']
    # An empty list means nothing was recognised; indexing [0] used to raise
    # IndexError here.
    if not words:
        return None
    return words[0]['words']
# Pick the keyword of the best-scoring recognition candidate
def get_keywords(result):
    """Return the ``keyword`` of the entry with the highest ``score``.

    Args:
        result: iterable of dicts carrying ``score`` and ``keyword`` keys;
            ``None`` or an empty iterable is accepted.

    Returns:
        The winning keyword, or ``None`` when there are no candidates.
    """
    candidates = list(result) if result is not None else []
    if not candidates:
        return None
    # Scanning from the end keeps the earlier tie-break (the last of several
    # equally scored entries wins) without sorting the whole list.
    best = max(reversed(candidates), key=lambda entry: entry['score'])
    return best['keyword']

if __name__ == '__main__':
    # Manual smoke run: OCR a local sample picture and print what was read
    print(get_TextResult(get_TextToken(), 'shanhu.png'))

main.py

# author: LiuShihao
# data: 2020/12/8 9:05 上午
# youknow: 各位老铁,我的这套代码曾经有人出价三个亿我没有卖,如今拿出来和大家分享,不求别的,只求大家免费的小红心帮忙点一点,这里谢过了。
# desc:

import os,time,json,re,base64,requests
from selenium import webdriver
from lxml import etree
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# 禁用安全请求警告
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import Baidu.main
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True


# Captcha endpoint; the fixed query string (including the callback name) is
# reproduced unchanged, only split across lines for readability.
captcha_url = (
    "https://kyfw.12306.cn/passport/captcha/captcha-image64"
    "?login_site=E&module=login&rand=sjrand&1607391410127"
    "&callback=jQuery19109754414031805079_1607391094834"
    "&_=1607391094838"
)
# Request headers sent with the captcha request (browser-style User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/87.0.4280.88 Safari/537.36"
    ),
}
# Fetch the captcha picture over HTTP and store it locally
def getImage():
    """Download the captcha and save it as ``captcha.jpg`` in the working directory.

    The reply is unwrapped as JSONP; its ``image`` field is Base64-decoded
    and written to disk.

    Returns:
        The path of the saved file (``os.getcwd() + '/captcha.jpg'``).

    Raises:
        ValueError: if the reply contains no ``(...)`` wrapper or the wrapped
            text is not valid JSON.
        KeyError: if the JSON document has no ``image`` field.
    """
    # Nothing visible in this script creates `session`, so the bare name
    # raised NameError. Reuse a module-level `session` when one exists (keeps
    # its cookies), otherwise fall back to a fresh requests.Session.
    http = globals().get('session')
    if http is None:
        http = requests.Session()
    response = http.get(url=captcha_url, headers=headers).text
    # Keep only the JSON between the outermost parentheses; unlike splitting
    # on "(", this survives parentheses or ");" inside the JSON text.
    payload = response[response.index("(") + 1:response.rindex(")")]
    image_bs64 = json.loads(payload)['image']
    captcha_response = base64.b64decode(image_bs64)
    # Store the decoded picture and hand its location back to the caller
    target = os.getcwd() + '/captcha.jpg'
    with open(target, 'wb') as f:
        f.write(captcha_response)
    return target
""" 将验证码 中的文字信息部分裁剪下来,保存为text.png文件 将八幅子图分别裁剪下来保存,,由于下载的图片都是有固定的位置,所以直接控制像素进行切割就行了 """
def cut_img(image):
    """Split a captcha file into its caption strip and eight tiles.

    The caption area (box ``(120, 0, 295, 30)``) is written to ``text.png``.
    The 2 x 4 tile grid is written to ``<cwd>/images/<row>_<col>.png``.
    Tiles are 67 px squares on a 72 px pitch, the grid starting at x=5, y=41.

    Args:
        image: path of the captcha picture (anything ``Image.open`` accepts).
    """
    # Saving fails when the target folder is missing, so create it up front.
    out_dir = os.getcwd() + '/images'
    os.makedirs(out_dir, exist_ok=True)
    # The context manager closes the source file once every crop is saved.
    with Image.open(image) as captcha:
        captcha.crop((120, 0, 295, 30)).save('text.png')
        # rows (y axis)
        for y in range(2):
            # columns (x axis); both indices come from range(), so the former
            # bounds asserts could never fail and were dropped
            for x in range(4):
                left = 5 + (67 + 5) * x
                top = 41 + (67 + 5) * y
                tile = captcha.crop((left, top, left + 67, top + 67))
                tile.save(out_dir + '/%s_%s.png' % (y, x))
if __name__ == '__main__':
    # Download the captcha, then split it into text.png and images/<row>_<col>.png
    cut_img(getImage())

    # Text recognition uses its own access token
    text_token = Baidu.main.get_TextToken()
    resultText = Baidu.main.get_TextResult(text_token, 'text.png')
    print('resultText:', resultText)

    # Run image recognition on every tile, row by row, with the image token
    access_token = Baidu.main.get_token()
    tile_paths = ['./images/%s_%s.png' % (y, x) for y in range(2) for x in range(4)]
    keywords = [
        Baidu.main.get_keywords(Baidu.main.get_result(access_token, path))
        for path in tile_paths
    ]
    print('keywords:', keywords)

效果

本文地址:https://blog.csdn.net/DreamsArchitects/article/details/110850388