在线测试地址:simple_ocr
可识别百度贴吧旋转验证码,本人测试识别率为85%
GitHub链接在文章末尾
以下是识别代码
r'''
Baidu Tieba rotation-captcha image recognition.
'''
import tensorflow as tf
import keras.backend as K
import cv2
import numpy as np
import os
from PIL import Image


class my_ocr():
    """Predict the rotation angle of a rotation captcha with a Keras classifier.

    The model is loaded once in ``__init__``; ``identification`` then maps an
    image file to one of the 360 angle labels in ``CLASS_NAMES``.
    """

    # Default weights file; resolved against the current working directory.
    DEFAULT_MODEL_PATH = "旋转验证码.hdf5"

    # CLASS_NAMES[i] is the label reported when output unit i has the highest
    # score. The labels are the strings '0'..'359' in lexicographic order
    # ('0', '1', '10', '100', ...), not numeric order.
    # NOTE(review): presumably this mirrors alphabetically sorted training
    # directories -- confirm against the training pipeline.
    CLASS_NAMES = sorted(str(angle) for angle in range(360))

    def __init__(self, model_path=DEFAULT_MODEL_PATH):
        """Load the rotation model.

        :param model_path: path to the saved model; defaults to the bundled
            file name, looked up relative to the current working directory.
        """
        # To enable GPU memory growth, uncomment:
        # physical_devices = tf.config.list_physical_devices('GPU')
        # tf.config.experimental.set_memory_growth(physical_devices[0], True)
        self.model = tf.keras.models.load_model(
            os.path.abspath(model_path),
            custom_objects={'angle_error': self.angle_error},
        )

    def angle_difference(self, x, y):
        """Return the smallest absolute difference between two angles in degrees."""
        return 180 - abs(abs(x - y) - 180)

    def angle_error(self, y_true, y_pred):
        """Metric registered as the custom object ``angle_error`` when loading the model.

        Takes the argmax of both tensors, applies ``angle_difference`` and
        returns the mean of the absolute result cast to the backend float type.
        NOTE(review): argmax yields class indices; with the lexicographic
        label order used by ``identification`` an index is not the angle
        itself -- confirm this matches how the model was trained.
        """
        diff = self.angle_difference(K.argmax(y_true), K.argmax(y_pred))
        return K.mean(K.cast(K.abs(diff), K.floatx()))

    def round(self, img_path, times):
        """Cut the image down to its inscribed circle and save the result.

        :param img_path: path of the source image.
        :param times: path the circular RGBA image is saved to.
        :return: None
        """
        # Close the source file as soon as the pixel data has been converted.
        with Image.open(img_path) as source:
            ima = source.convert("RGBA")
        size = ima.size

        # A circle needs a square image: resize to the shorter side.
        r2 = min(size[0], size[1])
        if size[0] != size[1]:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter. Fall back to the module-level constant on old Pillow.
            lanczos = getattr(Image, "Resampling", Image).LANCZOS
            ima = ima.resize((r2, r2), lanczos)

        # Radius of the resulting circle.
        r3 = int(r2 / 2)
        imb = Image.new('RGBA', (r3 * 2, r3 * 2), (255, 255, 255, 0))
        pima = ima.load()  # pixel access objects
        pimb = imb.load()
        r = float(r2 / 2)  # centre coordinate (same for x and y)
        offset = r - r3    # 0.5 when the side length is odd, else 0.0
        radius_sq = r3 * r3

        for i in range(r2):
            lx = abs(i - r)  # horizontal distance to the centre
            for j in range(r2):
                ly = abs(j - r)  # vertical distance to the centre
                # Compare squared distances: same result as comparing the
                # Euclidean distance with r3, without the square root.
                if lx * lx + ly * ly < radius_sq:
                    # Pixel coordinates must be integers; truncate explicitly.
                    pimb[int(i - offset), int(j - offset)] = pima[i, j]

        imb.save(times)
        return

    def gray(self, img_path):
        """Convert the image at ``img_path`` to grayscale, overwriting the file."""
        with Image.open(img_path) as img:
            gray_img = img.convert('L')
        gray_img.save(img_path)

    # Rotation-angle prediction
    def identification(self, img_path):
        """Predict the rotation angle of the captcha image.

        Side effect: the file at ``img_path`` is overwritten with its
        grayscale version before inference.

        :param img_path: image path, e.g. ``D/1.png``.
        :return: the predicted label as a string, one of '0'..'359'.
        """
        # self.round(img_path, img_path)  # optional: make the picture circular
        self.gray(img_path)  # convert to grayscale on disk
        img_init = cv2.imread(img_path)
        # The model input is reshaped to (1, 224, 224, 3) below.
        img_init = cv2.resize(img_init, (224, 224))
        img = np.asarray(img_init)
        outputs = self.model.predict(img.reshape(1, 224, 224, 3))
        result_index = int(np.argmax(outputs))
        return self.CLASS_NAMES[result_index]
以下是调用代码
from ocr_code import my_ocr

# NOTE(review): `img` is not defined in this snippet; it presumably holds the
# raw bytes of the downloaded captcha image -- confirm against the caller.
# The image is written to the same path that is recognised below (the original
# wrote '1.jpg' but then read './1.png').
with open('./1.png', 'wb') as f:
    f.write(img)

# ./1.png is the rotation captcha to recognise. `identification` is an
# instance method, so the recogniser has to be instantiated first.
ocr = my_ocr()
result = ocr.identification('./1.png')
print(result)
输入示例:
输出示例(与上面的图片无关):
以下是测试代码:
import cv2
from selenium.webdriver.common.by import By
import random
from selenium.webdriver import ActionChains
from selenium import webdriver
import time
from lxml import etree
import requests
from selenium.webdriver.chrome.options import Options
# import my_ocr
from my_ocr import my_ocr


class my_web():
    """Drive Chrome through the Baidu rotation captcha using ``my_ocr`` predictions."""

    # Captcha test page. The query string separator before `timestamp` was
    # corrupted to "×tamp" (HTML entity `&times`) in the published listing.
    CAPTCHA_URL = (
        'https://wappass.baidu.com/static/captcha/tuxing.html'
        '?ak=2ef521ec36290baed33d66de9b16f625'
        '&backurl=http%3A%2F%2Ftieba.baidu.com%2F'
        '&timestamp=1655176222'
        '&signature=7166d8dcec4ed272e5d84314de53e574'
    )

    # Slide distance that corresponds to a full 360 degree rotation.
    # NOTE(review): presumably the slider travel in pixels -- confirm on the page.
    SLIDER_TRAVEL = 212

    # Seconds to wait for the captcha image download before giving up.
    DOWNLOAD_TIMEOUT = 10

    def __init__(self):
        """Start Chrome with the automation switches disabled."""
        option = Options()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_argument('--disable-blink-features=AutomationControlled')
        # Selenium 4 takes `options=`; the old `chrome_options=` keyword was removed.
        self.driver = webdriver.Chrome(options=option)

    def __ease_out_expo(self, sep):
        """Exponential ease-out: maps progress ``sep`` in [0, 1] to eased progress."""
        if sep == 1:
            return 1
        return 1 - pow(2, -10 * sep)

    def generate_tracks(self, distance):
        """Generate a slide track for the given distance.

        :param distance: distance to slide; converted with ``int()``.
        :return: list of ``[x, y, t]`` entries, where
            x is the horizontal distance already covered,
            y is the vertical distance already covered (always 0 here),
            t is the elapsed time in milliseconds.
        :raises ValueError: if the converted distance is negative.
        """
        distance = int(distance)
        if distance < 0:
            raise ValueError(f"distance类型必须是大于等于0的整数: distance: {distance}, type: {type(distance)}")

        # The track starts at the origin at time 0.
        slide_track = [
            [0, 0, 0],
        ]
        # Number of sampled slider positions.
        count = 30 + int(distance / 2)
        # Initial elapsed time.
        t = random.randint(50, 100)
        # Last recorded position.
        _x = 0
        _y = 0
        for i in range(count):
            # Horizontal distance covered at this sample.
            x = round(self.__ease_out_expo(i / count) * distance)
            # Time consumed by this step.
            t += random.randint(10, 20)
            if x == _x:
                continue  # skip samples that did not move the slider
            slide_track.append([x, _y, t])
            _x = x

        # The easing curve never reaches exactly 1 inside the loop, so make
        # sure the track really ends on the requested distance.
        if slide_track[-1][0] != distance:
            t += random.randint(10, 20)
            slide_track.append([distance, _y, t])

        # Repeat the final point (as a copy, not an alias of the same list).
        slide_track.append(list(slide_track[-1]))
        return slide_track

    def main(self):
        """Open the captcha page and keep solving until the target page is shown."""
        self.driver.get(self.CAPTCHA_URL)
        ocr = my_ocr()
        while True:
            time.sleep(3)
            page = etree.HTML(self.driver.page_source)
            url = page.xpath('//*[@class="vcode-spin-img"]/@src')[0]
            # A timeout keeps a stalled download from hanging the loop forever.
            image_bytes = requests.get(url, timeout=self.DOWNLOAD_TIMEOUT).content
            with open('./1.png', 'wb') as f:
                f.write(image_bytes)

            result = ocr.identification('./1.png')
            displacement_distance = self.SLIDER_TRAVEL / 360 * int(result)
            print('预测旋转角度为:', result, '滑动距离为:', displacement_distance)

            source = self.driver.find_element(By.XPATH, r'//*[@class="vcode-spin-button"]/p')
            action = ActionChains(self.driver, duration=10)
            action.click_and_hold(source).perform()
            a = 0  # x position reached so far; offsets are relative moves
            for x in self.generate_tracks(displacement_distance):
                action.move_by_offset(xoffset=x[0] - a, yoffset=x[1])
                a = x[0]
            action.release(source).perform()

            time.sleep(2)
            if '热门吧' in self.driver.page_source:
                break
            # Otherwise loop and try again.


if __name__ == '__main__':
    a = my_web()
    a.main()
该项目包含多种验证码的识别(还在敲…现在只是个壳子,公司有需求要搞饿了么的爬取,所以说最近更新频率会大幅下降)
github: https://github.com/Bump-mann/simple_ocr
模型下载:
1、csdn(我设置的免费,如果收费了,请联系我改掉)
https://download.csdn.net/download/qq_44749634/86260445
2、百度网盘:链接:https://pan.baidu.com/s/1ZiDxjzzfDm-oVq2oxSjaHg?pwd=xk69
提取码:xk69
————————以下是笔者的碎碎念————————————
嗯…把最近做的滑块、旋转、面积做成了接口,想法是再加点其他类型的验证码,然后github开源,目前开源的验证码识别还是蛮少的,明明有不少人还不会搞,为什么不开源出来,帮助大家呢?
参考ddddocr做成一个包?感觉不错,可以帮到很多同行,想想就开心