Add new PDFs; test partly passed
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,2 +1,2 @@
|
||||
models/**
|
||||
#
|
||||
output/**
|
||||
1
docext-test/.gitignore
vendored
1
docext-test/.gitignore
vendored
@@ -0,0 +1 @@
|
||||
output/
|
||||
@@ -3,122 +3,16 @@ from gradio_client import Client, handle_file
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
# def extract_invoice_info(markdown_text):
|
||||
# try:
|
||||
# # 提取发票号码
|
||||
# invoice_number = re.search(r'发票号码:\s*(\d+)', markdown_text)
|
||||
# if not invoice_number:
|
||||
# raise ValueError("无法提取发票号码")
|
||||
|
||||
# # 提取销售方名称
|
||||
# seller_section = markdown_text.split('销售方信息')[-1]
|
||||
# seller_name = re.search(r'名称:\s*(.*?)\n', seller_section)
|
||||
# if not seller_name:
|
||||
# raise ValueError("无法提取销售方名称")
|
||||
|
||||
# # 提取小写金额
|
||||
# amount = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
|
||||
# if not amount:
|
||||
# raise ValueError("无法提取小写金额")
|
||||
|
||||
# return {
|
||||
# "发票号码": invoice_number.group(1),
|
||||
# "销售方名称": seller_name.group(1).strip(),
|
||||
# "金额": amount.group(1)
|
||||
# }
|
||||
# except Exception as e:
|
||||
# print(f"提取信息时出错: {e}")
|
||||
# return None
|
||||
|
||||
|
||||
markdown_text = """ Page 1 of 1
|
||||
| | |
|
||||
| --- | --- |
|
||||
| [QR Code] | |
|
||||
|
||||
**发票信息**
|
||||
电子发票(普通发票)
|
||||
<signature>国家税务总局厦门市税务局</signature>
|
||||
<signature>厦门市税务局</signature>
|
||||
|
||||
发票号码: 25947000000028179639
|
||||
开票日期: 2025年05月20日
|
||||
|
||||
**购买方信息**
|
||||
名称: 集美大学
|
||||
统一社会信用代码/纳税人识别号: 12350000426600329N
|
||||
|
||||
**销售方信息**
|
||||
名称: 厦门京东东和贸易有限公司
|
||||
统一社会信用代码/纳税人识别号: 91350212MA34A9L25L
|
||||
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>项目名称</th>
|
||||
<th>规格型号</th>
|
||||
<th>单位</th>
|
||||
<th>数量</th>
|
||||
<th>单价</th>
|
||||
<th>金额</th>
|
||||
<th>税率/征收率</th>
|
||||
<th>税额</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
|
||||
<td>ROSE-240100C</td>
|
||||
<td>个</td>
|
||||
<td>2</td>
|
||||
<td>23.01</td>
|
||||
<td>46.02</td>
|
||||
<td>13%</td>
|
||||
<td>5.98</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>*24V2A电源适配器1000mA</td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td>-7.18</td>
|
||||
<td>13%</td>
|
||||
<td>-0.93</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>合计</td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td>¥38.84</td>
|
||||
<td></td>
|
||||
<td>¥5.05</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>价税合计(大写)</td>
|
||||
<td>肆拾叁圆捌角玖分</td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td>(小写) ¥43.89</td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
备 注 订单号:316584139470
|
||||
|
||||
开票人: 王梅"""
|
||||
def save_to_json(data, filename):
    """Save ``data`` to ``filename`` as an indented, UTF-8 encoded JSON file.

    Args:
        data: Any object that ``json.dump`` can serialize.
        filename: Destination path. Missing parent directories are created.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If ``data`` is not JSON serializable.
    """
    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when one is given.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(data, f, ensure_ascii=False, indent=4)
|
||||
|
||||
def extract_invoice_info(markdown_text):
|
||||
try:
|
||||
|
||||
|
||||
|
||||
# 提取发票号码
|
||||
invoice_match = re.search(r'发票号码:\s*(\d+)', markdown_text)
|
||||
if not invoice_match:
|
||||
@@ -138,8 +32,6 @@ def extract_invoice_info(markdown_text):
|
||||
# amount_match = re.search(r'小写\s*¥(\d+\.\d+)', markdown_text)
|
||||
amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
|
||||
|
||||
|
||||
|
||||
if not amount_match:
|
||||
raise ValueError("未找到金额信息")
|
||||
amount = amount_match.group(1)
|
||||
@@ -151,33 +43,23 @@ def extract_invoice_info(markdown_text):
|
||||
"total_amount": amount,
|
||||
"items": []
|
||||
}
|
||||
|
||||
# print("amount:", amount)
|
||||
# 提取商品明细
|
||||
item_section = markdown_text.split('<tbody>')
|
||||
if len(item_section) < 2:
|
||||
raise ValueError("未找到商品明细部分")
|
||||
|
||||
|
||||
print("发票号码:", invoice_number)
|
||||
print("销售方名称:", seller_name)
|
||||
print("金额:", amount)
|
||||
|
||||
# 修正表格解析逻辑
|
||||
table_rows = re.findall(r'<tr>.*?</tr>', item_section[1],re.DOTALL)
|
||||
if len(table_rows) < 3:
|
||||
raise ValueError("商品明细数据不完整")
|
||||
|
||||
for row in table_rows[:-2]:
|
||||
# print("row:", row)
|
||||
item_name_match = re.search(r'<td>(.*?)</td>', row)
|
||||
model_match = re.search(r'<td[^>]*>.*?</td>\s*<td[^>]*>(.*?)</td>', row)
|
||||
quantity_match = re.search(r'<td>(\d+)</td>', row)
|
||||
# print("item_name_match:", item_name_match)
|
||||
# print("model_match:", model_match)
|
||||
# print("quantity_match:", quantity_match)
|
||||
# if not all([item_name_match, model_match, quantity_match]):
|
||||
# raise ValueError("商品信息解析失败")
|
||||
|
||||
if item_name_match is not None:
|
||||
print("项目名称:", item_name_match.group(1))
|
||||
@@ -203,27 +85,7 @@ def extract_invoice_info(markdown_text):
|
||||
"quantity": quantity
|
||||
}
|
||||
invoice_data["items"].append(item_data)
|
||||
|
||||
# # 解析第一行数据
|
||||
# first_row = table_rows[0] # 跳过表头
|
||||
# # print("first_row:", first_row)
|
||||
# item_name_match = re.search(r'<td>(.*?)</td>', first_row)
|
||||
# model_match = re.search(r'<td[^>]*>.*?</td>\s*<td[^>]*>(.*?)</td>', first_row)
|
||||
# quantity_match = re.search(r'<td>(\d+)</td>', first_row)
|
||||
# # print("item_name_match:", item_name_match)
|
||||
# # print("model_match:", model_match)
|
||||
# # print("quantity_match:", quantity_match)
|
||||
# if not all([item_name_match, model_match, quantity_match]):
|
||||
# raise ValueError("商品信息解析失败")
|
||||
|
||||
# print("发票号码:", invoice_number)
|
||||
# print("销售方名称:", seller_name)
|
||||
# print("项目名称:", item_name_match.group(1))
|
||||
# print("规格型号:", model_match.group(1))
|
||||
# print("数量:", quantity_match.group(1))
|
||||
# print("金额:", amount)
|
||||
return invoice_data
|
||||
|
||||
except Exception as e:
|
||||
print(f"解析发票信息时出错: {str(e)}")
|
||||
return None
|
||||
@@ -254,10 +116,8 @@ def convert_pdf_to_markdown(
|
||||
images=file_inputs,
|
||||
api_name="/process_markdown_streaming"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_pdf_files(directory):
|
||||
pdf_files = []
|
||||
for root, dirs, files in os.walk(directory):
|
||||
@@ -266,8 +126,6 @@ def get_pdf_files(directory):
|
||||
pdf_files.append(os.path.join(root, file))
|
||||
return pdf_files
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# # test extract_invoice_info function
|
||||
# info = extract_invoice_info(markdown_text)
|
||||
@@ -275,21 +133,26 @@ if __name__ == "__main__":
|
||||
|
||||
# Example usage
|
||||
# client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
|
||||
CLIENT_URL = "https://fceec28e477468b094.gradio.live/"
|
||||
CLIENT_URL = "http://172.29.57.6:9998/"
|
||||
client = Client(CLIENT_URL, auth=("admin", "admin"))
|
||||
|
||||
pdf_directory = "pdfs"
|
||||
|
||||
output_dir = "output"
|
||||
pdf_files = get_pdf_files(pdf_directory)
|
||||
print(pdf_files)
|
||||
for pdf_file in pdf_files:
|
||||
print(f"Found PDF file: {pdf_file}")
|
||||
|
||||
# Single image conversion
|
||||
markdown_content = convert_pdf_to_markdown(
|
||||
[pdf_file],client
|
||||
)
|
||||
|
||||
markdown_content = convert_pdf_to_markdown([pdf_file],client)
|
||||
# print(markdown_content)
|
||||
invoice_info = extract_invoice_info(markdown_content)
|
||||
print(f"Extracted invoice info: {invoice_info}")
|
||||
if invoice_info:
|
||||
# 生成输出文件名
|
||||
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
||||
print(f"Base name: {base_name}")
|
||||
json_file = os.path.join(output_dir, f"{base_name}.json")
|
||||
print(f"JSON file path: {json_file}")
|
||||
|
||||
# 保存为JSON
|
||||
save_to_json(invoice_info, json_file)
|
||||
print(f"发票信息已保存到: {json_file}")
|
||||
|
||||
BIN
docext-test/pdfs/250407/250224_21.76.pdf
Normal file
BIN
docext-test/pdfs/250407/250224_21.76.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250407/250308_53.47.pdf
Normal file
BIN
docext-test/pdfs/250407/250308_53.47.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250407/250312_21.00.pdf
Normal file
BIN
docext-test/pdfs/250407/250312_21.00.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250407/250313_80.00.pdf
Normal file
BIN
docext-test/pdfs/250407/250313_80.00.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250407/250317_63.50.pdf
Normal file
BIN
docext-test/pdfs/250407/250317_63.50.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250407/250318_70.06.pdf
Normal file
BIN
docext-test/pdfs/250407/250318_70.06.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250407/250321_26.49.pdf
Normal file
BIN
docext-test/pdfs/250407/250321_26.49.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250307_78.36.pdf
Normal file
BIN
docext-test/pdfs/250715/250307_78.36.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250308_53.47.pdf
Normal file
BIN
docext-test/pdfs/250715/250308_53.47.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250309_4.81.pdf
Normal file
BIN
docext-test/pdfs/250715/250309_4.81.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250312_49.18.pdf
Normal file
BIN
docext-test/pdfs/250715/250312_49.18.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250405_5.60.pdf
Normal file
BIN
docext-test/pdfs/250715/250405_5.60.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250516_11.50.pdf
Normal file
BIN
docext-test/pdfs/250715/250516_11.50.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250520_43.89.pdf
Normal file
BIN
docext-test/pdfs/250715/250520_43.89.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250604_970.pdf
Normal file
BIN
docext-test/pdfs/250715/250604_970.pdf
Normal file
Binary file not shown.
BIN
docext-test/pdfs/250715/250604_970_2.pdf
Normal file
BIN
docext-test/pdfs/250715/250604_970_2.pdf
Normal file
Binary file not shown.
Reference in New Issue
Block a user