使用百度表格文字识别(异步接口)

近期百度的文字识别接口进行了很大的改进,不再使用原先的Aip,而是转用Token机制。
百度的通用文字识别价格未变动;表格文字识别每天可提交50次识别请求,获取结果的请求暂无次数限制。图片中的表格识别后,可转化为excel文件或者json。

识别转化为excel文件

import requests
import json
import base64
import time
# Fetch an access_token. Per the original author's note, the token is
# re-fetched on every run because its validity period is short.
def get_access_token(client_id='KwXkGawxh0sjOQdF9Ae9LeLb',
                     client_secret='siprEKMp5UcRTOAngEfIOOe9x6xkqGXq',
                     timeout=10):
    """Request an OAuth access token from the Baidu AIP token endpoint.

    Args:
        client_id: the API Key (AK) obtained from the Baidu console.
        client_secret: the Secret Key (SK) obtained from the Baidu console.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The value of the ``access_token`` field of the JSON response.

    Raises:
        RuntimeError: if the JSON response carries no ``access_token``
            (the full response body is included in the message).
    """
    # NOTE(review): the AK/SK defaults are kept only so existing
    # zero-argument calls keep working. Secrets should not live in source;
    # pass them in from configuration or the environment instead.
    host = 'https://aip.baidubce.com/oauth/2.0/token'
    query = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    }
    # Let requests build and escape the query string instead of str.format.
    response = requests.get(host, params=query, timeout=timeout)
    data = response.json()
    if 'access_token' not in data:
        # Surface the service's own error payload rather than a bare KeyError.
        raise RuntimeError(
            'token request returned no access_token: {}'.format(data))
    return data['access_token']

def get_excel(requests_id, access_token, output_path='识别结果11.xls',
              timeout=30):
    """Download the Excel result of a submitted table-recognition task.

    Queries the ``get_request_result`` endpoint for ``requests_id``, reads
    the download URL from ``result.result_data``, saves the downloaded bytes
    to ``output_path`` and prints the service's ``ret_msg``.

    Args:
        requests_id: the ``request_id`` returned when the image was submitted.
        access_token: a token as returned by the token endpoint.
        output_path: where the downloaded workbook is written.
        timeout: seconds to wait for each HTTP request.

    Raises:
        RuntimeError: if the response has no ``result`` object, or if
            ``result_data`` is empty (no download URL available yet).
    """
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    pargams = {
        'request_id': requests_id,
        'result_type': 'excel',  # per the original author, excel is the default
    }
    url = 'https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result'
    url_all = url + "?access_token=" + access_token
    # Ask the service for the link to the generated Excel file.
    res = requests.post(url_all, headers=headers, params=pargams,
                        timeout=timeout)
    body = res.json()  # parse once; the original parsed the body twice
    result = body.get('result')
    if not result:
        raise RuntimeError(
            'get_request_result returned no result: {}'.format(body))
    info_1 = result['ret_msg']
    excel_url = result['result_data']
    if not excel_url:
        # An empty URL means there is nothing to download yet; fail with the
        # service's message instead of an unrelated invalid-URL error.
        raise RuntimeError(
            'no result URL for request {}: {}'.format(requests_id, info_1))
    excel_1 = requests.get(excel_url, timeout=timeout).content
    with open(output_path, 'wb') as f:
        f.write(excel_1)
    print(info_1)


# Submit one image to the asynchronous table-recognition endpoint, wait, then
# download the Excel result for the returned request id.
request_url = "https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/request"
# Open the image in binary mode and base64-encode it; the context manager
# closes the file handle as soon as the bytes have been read.
with open('./data/pic/c61c9249-1d59-4ccc-9606-8fa99751f11a-11.jpg', 'rb') as f:
    img = base64.b64encode(f.read())

params = {"image": img}
access_token = get_access_token()
request_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}
response = requests.post(request_url, data=params, headers=headers)
# Stop on an HTTP error here; otherwise m_xx would be unbound further down.
response.raise_for_status()
m_xx = response.json()
if 'result' not in m_xx:
    # Show the whole payload so the service's error details are visible.
    raise RuntimeError(
        'form_ocr/request returned no result: {}'.format(m_xx))
requests_id = m_xx['result'][0]['request_id']
print(requests_id)
# Wait a little longer than strictly needed so the service has time to finish
# processing; otherwise the result URL comes back empty.
time.sleep(10)
get_excel(requests_id, access_token)

获取的文件为xls文件,暂时无法获得xlsx文件。

识别转化为json

百度返回的json数据格式并不是很规范,无法直接使用,需要进行转换。

# Query the same task again, this time asking for JSON instead of an Excel
# link. The request parameters must be defined here: the dict of the same
# name inside get_excel() is local to that function.
pargams = {
    'request_id': requests_id,
    'result_type': 'json',
}
url = 'https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result'
url_all = url + "?access_token=" + access_token
res = requests.post(url_all, headers=headers, params=pargams)
# result_data is decoded with json.loads below, i.e. it is expected to arrive
# as a JSON-encoded string rather than as a nested object.
excel_1 = res.json()['result']['result_data']
m_xx = json.loads(excel_1)

print(m_xx['forms'][0])  # per the author: dict with the table's content and layout
print(m_xx['forms'][0]['body'])  # per the author: list with the table's cell contents

识别保存为json文件

# Fetch the recognition result as JSON and save the first form to disk.
# The request parameters are defined here so this snippet does not depend on
# a dict that only exists inside get_excel().
pargams = {
    'request_id': requests_id,
    'result_type': 'json',
}
url = 'https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result'
url_all = url + "?access_token=" + access_token
res = requests.post(url_all, headers=headers, params=pargams)
result = res.json()['result']
info_1 = result['ret_msg']  # status message, printed once the file is written
excel_1 = result['result_data']
# result_data is a JSON-encoded string; decode it before picking the form.
m_xx = json.loads(excel_1)

# ensure_ascii=False writes non-ASCII text verbatim, so fix the file encoding
# explicitly instead of relying on the platform default.
with open('识别结果12.json', 'w', encoding='utf-8') as fl:
    json.dump(m_xx['forms'][0], fl, ensure_ascii=False)
print(info_1)

添加新评论