近期百度的文字识别接口进行了很大的改进,不再使用原先的Aip,而是转用Token机制。
百度的通用文字识别价格未变动;表格文字识别每天可提交50次识别请求,获取识别结果的请求暂无次数限制。图片中的表格识别完成后,结果可以转化为excel文件或者json。
识别转化为excel文件
import requests
import json
import base64
import time
def get_access_token(client_id=None, client_secret=None, timeout=30):
    """Fetch an access token from the Baidu AIP OAuth endpoint.

    Sends a client-credentials request to
    ``https://aip.baidubce.com/oauth/2.0/token`` and returns the
    ``access_token`` field of the JSON response.

    Original author's note: the token changes after a short time, so it is
    fetched again on every use.
    NOTE(review): if the response carries an ``expires_in`` field, it could
    be used to cache the token instead -- confirm against the API docs.

    Args:
        client_id: The API key (AK) from the Baidu console. Defaults to the
            key embedded below.
        client_secret: The secret key (SK) from the Baidu console. Defaults
            to the secret embedded below.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The access token string.

    Raises:
        RuntimeError: If the response JSON has no ``access_token`` field.
    """
    # NOTE(review): credentials are committed in source. They are kept only
    # as a fallback so existing zero-argument callers keep working; prefer
    # passing them in and rotating these keys.
    if client_id is None:
        client_id = 'KwXkGawxh0sjOQdF9Ae9LeLb'
    if client_secret is None:
        client_secret = 'siprEKMp5UcRTOAngEfIOOe9x6xkqGXq'

    # Let requests build and URL-encode the query string instead of
    # formatting the credentials into the URL by hand.
    response = requests.get(
        'https://aip.baidubce.com/oauth/2.0/token',
        params={
            'grant_type': 'client_credentials',
            'client_id': client_id,
            'client_secret': client_secret,
        },
        timeout=timeout,
    )
    data = response.json()
    if 'access_token' not in data:
        # Surface whatever the service returned rather than a bare KeyError.
        raise RuntimeError(
            'failed to obtain access_token (HTTP {}): {}'.format(
                response.status_code, data))
    return data['access_token']
def get_excel(requests_id, access_token, output_path='识别结果11.xls',
              timeout=30):
    """Download the Excel result of a table-recognition request.

    Posts ``request_id`` with ``result_type='excel'`` to the
    ``form_ocr/get_request_result`` endpoint, reads the download URL from
    ``result.result_data``, saves the downloaded bytes to ``output_path``
    and prints ``result.ret_msg``.

    Args:
        requests_id: The ``request_id`` returned when the image was
            submitted.
        access_token: Token appended to the URL as ``access_token``.
        output_path: Where to write the downloaded file.
        timeout: Seconds to wait for each HTTP request.

    Raises:
        RuntimeError: If the response has no ``result`` object, or if
            ``result_data`` is empty (the caller's comment notes this
            happens when the job has not finished yet).
        requests.HTTPError: If downloading the Excel file fails.
    """
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    query = {
        'request_id': requests_id,
        'result_type': 'excel',  # 'excel' is the default result type
    }
    url = 'https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result'
    url_all = url + "?access_token=" + access_token
    res = requests.post(url_all, headers=headers, params=query,
                        timeout=timeout)

    # Parse the body once and reuse it for both fields.
    body = res.json()
    if 'result' not in body:
        raise RuntimeError('unexpected response: {}'.format(body))
    result = body['result']
    info_1 = result['ret_msg']
    excel_url = result['result_data']

    # An empty URL means there is nothing to download yet; fail with the
    # service's own status message instead of an invalid-URL error.
    if not excel_url:
        raise RuntimeError('no result available yet: {}'.format(info_1))

    download = requests.get(excel_url, timeout=timeout)
    download.raise_for_status()  # do not save an error page as a spreadsheet
    with open(output_path, 'wb') as f:
        f.write(download.content)
    print(info_1)
# Submit an image to the table-recognition endpoint, then fetch the Excel
# result for the returned request id.
request_url = "https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/request"

# Read the image in binary mode; the context manager closes the file handle
# once the content has been base64-encoded.
with open('./data/pic/c61c9249-1d59-4ccc-9606-8fa99751f11a-11.jpg', 'rb') as f:
    img = base64.b64encode(f.read())

params = {"image": img}
access_token = get_access_token()
request_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}
response = requests.post(request_url, data=params, headers=headers, timeout=30)

# A requests.Response is truthy only for a non-error HTTP status. Everything
# below needs the response body, so it all lives inside this branch.
if response:
    m_xx = response.json()
    requests_id = m_xx['result'][0]['request_id']
    print(requests_id)
    # Wait a little longer than strictly necessary so the service has time
    # to finish processing; otherwise it returns an empty URL.
    time.sleep(10)
    get_excel(requests_id, access_token)
else:
    print('table OCR request failed:', response.status_code, response.text)
获取的文件为xls文件,暂时无法获得xlsx文件。
识别转化为json
百度返回的json数据并不是已经解析好的对象:result_data字段本身是一个JSON字符串,无法直接使用,需要先用json.loads进行转换。请求结果时需将result_type设为'json'。
# Fetch the recognition result as JSON and print it.
# Relies on `access_token`, `headers` and `requests_id` from the submission
# step earlier in this script.
url = 'https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result'
url_all = url + "?access_token=" + access_token
# Defined here because the dict of the same name inside get_excel() is local
# to that function. result_type must be 'json' so that result_data holds a
# JSON string rather than an Excel download URL.
pargams = {
    'request_id': requests_id,
    'result_type': 'json',
}
res = requests.post(url_all, headers=headers, params=pargams, timeout=30)
excel_1 = res.json()['result']['result_data']
if not excel_1:
    # Empty result_data cannot be parsed; report that instead of failing
    # inside json.loads.
    raise RuntimeError('no JSON result available yet for request ' + str(requests_id))
# result_data is itself a JSON-encoded string, so it needs a second decode.
m_xx = json.loads(excel_1)
# To save instead of print, json.dump(m_xx['forms'][0], fl, ensure_ascii=False).
print(m_xx['forms'][0])  # per the original author: table content and layout
print(m_xx['forms'][0]['body'])  # per the original author: the table cells
识别保存为json文件
# Fetch the recognition result as JSON and save the first form to a file.
# Relies on `access_token`, `headers` and `requests_id` from the submission
# step earlier in this script.
url = 'https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result'
url_all = url + "?access_token=" + access_token
# Defined here because the dict of the same name inside get_excel() is local
# to that function. result_type must be 'json' so that result_data holds a
# JSON string rather than an Excel download URL.
pargams = {
    'request_id': requests_id,
    'result_type': 'json',
}
res = requests.post(url_all, headers=headers, params=pargams, timeout=30)
result = res.json()['result']  # parse the body once, reuse for both fields
info_1 = result['ret_msg']
excel_1 = result['result_data']
if not excel_1:
    # Empty result_data cannot be parsed; report the service's own status.
    raise RuntimeError('no JSON result available yet: ' + str(info_1))
# result_data is itself a JSON-encoded string, so it needs a second decode.
m_xx = json.loads(excel_1)
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# set explicitly rather than left to the platform default.
with open('识别结果12.json', 'w', encoding='utf-8') as fl:
    json.dump(m_xx['forms'][0], fl, ensure_ascii=False)
print(info_1)