Add new PDFs; tests partly passed

This commit is contained in:
2025-07-23 17:53:18 +08:00
parent d2f2240a6c
commit 05cd6ae7b9
19 changed files with 22 additions and 158 deletions

2
.gitignore vendored
View File

@@ -1,2 +1,2 @@
models/**
#
output/**

View File

@@ -0,0 +1 @@
output/

View File

@@ -3,122 +3,16 @@ from gradio_client import Client, handle_file
import json
import re
import os
# def extract_invoice_info(markdown_text):
# try:
# # 提取发票号码
# invoice_number = re.search(r'发票号码:\s*(\d+)', markdown_text)
# if not invoice_number:
# raise ValueError("无法提取发票号码")
# # 提取销售方名称
# seller_section = markdown_text.split('销售方信息')[-1]
# seller_name = re.search(r'名称:\s*(.*?)\n', seller_section)
# if not seller_name:
# raise ValueError("无法提取销售方名称")
# # 提取小写金额
# amount = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
# if not amount:
# raise ValueError("无法提取小写金额")
# return {
# "发票号码": invoice_number.group(1),
# "销售方名称": seller_name.group(1).strip(),
# "金额": amount.group(1)
# }
# except Exception as e:
# print(f"提取信息时出错: {e}")
# return None
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as indented, UTF-8 encoded JSON.

    Args:
        data: A JSON-serializable object (here, the parsed invoice dict).
        filename: Destination path. Missing parent directories are created;
            a bare file name is written to the current working directory.

    Raises:
        OSError: If the parent directory cannot be created or the file
            cannot be opened for writing.
        TypeError: If ``data`` contains values ``json`` cannot serialize.
    """
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text (such as the Chinese
        # field values) readable in the file instead of \uXXXX escapes.
        json.dump(data, f, ensure_ascii=False, indent=4)
def extract_invoice_info(markdown_text):
try:
# 提取发票号码
invoice_match = re.search(r'发票号码:\s*(\d+)', markdown_text)
if not invoice_match:
@@ -138,8 +32,6 @@ def extract_invoice_info(markdown_text):
# amount_match = re.search(r'小写\s*¥(\d+\.\d+)', markdown_text)
amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
if not amount_match:
raise ValueError("未找到金额信息")
amount = amount_match.group(1)
@@ -151,33 +43,23 @@ def extract_invoice_info(markdown_text):
"total_amount": amount,
"items": []
}
# print("amount:", amount)
# 提取商品明细
item_section = markdown_text.split('<tbody>')
if len(item_section) < 2:
raise ValueError("未找到商品明细部分")
raise ValueError("未找到商品明细部分")
print("发票号码:", invoice_number)
print("销售方名称:", seller_name)
print("金额:", amount)
# 修正表格解析逻辑
table_rows = re.findall(r'<tr>.*?</tr>', item_section[1],re.DOTALL)
if len(table_rows) < 3:
raise ValueError("商品明细数据不完整")
for row in table_rows[:-2]:
# print("row:", row)
item_name_match = re.search(r'<td>(.*?)</td>', row)
model_match = re.search(r'<td[^>]*>.*?</td>\s*<td[^>]*>(.*?)</td>', row)
quantity_match = re.search(r'<td>(\d+)</td>', row)
# print("item_name_match:", item_name_match)
# print("model_match:", model_match)
# print("quantity_match:", quantity_match)
# if not all([item_name_match, model_match, quantity_match]):
# raise ValueError("商品信息解析失败")
if item_name_match is not None:
print("项目名称:", item_name_match.group(1))
@@ -202,28 +84,8 @@ def extract_invoice_info(markdown_text):
"model": model,
"quantity": quantity
}
invoice_data["items"].append(item_data)
# # 解析第一行数据
# first_row = table_rows[0] # 跳过表头
# # print("first_row:", first_row)
# item_name_match = re.search(r'<td>(.*?)</td>', first_row)
# model_match = re.search(r'<td[^>]*>.*?</td>\s*<td[^>]*>(.*?)</td>', first_row)
# quantity_match = re.search(r'<td>(\d+)</td>', first_row)
# # print("item_name_match:", item_name_match)
# # print("model_match:", model_match)
# # print("quantity_match:", quantity_match)
# if not all([item_name_match, model_match, quantity_match]):
# raise ValueError("商品信息解析失败")
# print("发票号码:", invoice_number)
# print("销售方名称:", seller_name)
# print("项目名称:", item_name_match.group(1))
# print("规格型号:", model_match.group(1))
# print("数量:", quantity_match.group(1))
# print("金额:", amount)
invoice_data["items"].append(item_data)
return invoice_data
except Exception as e:
print(f"解析发票信息时出错: {str(e)}")
return None
@@ -254,10 +116,8 @@ def convert_pdf_to_markdown(
images=file_inputs,
api_name="/process_markdown_streaming"
)
return result
def get_pdf_files(directory):
pdf_files = []
for root, dirs, files in os.walk(directory):
@@ -266,8 +126,6 @@ def get_pdf_files(directory):
pdf_files.append(os.path.join(root, file))
return pdf_files
if __name__ == "__main__":
# # test extract_invoice_info function
# info = extract_invoice_info(markdown_text)
@@ -275,21 +133,26 @@ if __name__ == "__main__":
# Example usage
# client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
CLIENT_URL = "https://fceec28e477468b094.gradio.live/"
CLIENT_URL = "http://172.29.57.6:9998/"
client = Client(CLIENT_URL, auth=("admin", "admin"))
pdf_directory = "pdfs"
output_dir = "output"
pdf_files = get_pdf_files(pdf_directory)
print(pdf_files)
for pdf_file in pdf_files:
print(f"Found PDF file: {pdf_file}")
# Single image conversion
markdown_content = convert_pdf_to_markdown(
[pdf_file],client
)
markdown_content = convert_pdf_to_markdown([pdf_file],client)
# print(markdown_content)
invoice_info = extract_invoice_info(markdown_content)
print(f"Extracted invoice info: {invoice_info}")
if invoice_info:
# 生成输出文件名
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
print(f"Base name: {base_name}")
json_file = os.path.join(output_dir, f"{base_name}.json")
print(f"JSON file path: {json_file}")
# 保存为JSON
save_to_json(invoice_info, json_file)
print(f"发票信息已保存到: {json_file}")

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.