diff --git a/.gitignore b/.gitignore
index 4e8ff1b..270567b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
models/**
-#
\ No newline at end of file
+output/**
\ No newline at end of file
diff --git a/docext-test/.gitignore b/docext-test/.gitignore
index e69de29..9b1960e 100644
--- a/docext-test/.gitignore
+++ b/docext-test/.gitignore
@@ -0,0 +1 @@
+output/
\ No newline at end of file
diff --git a/docext-test/docext_api_test.py b/docext-test/docext_api_test.py
index acf9f1c..41d0517 100644
--- a/docext-test/docext_api_test.py
+++ b/docext-test/docext_api_test.py
@@ -3,122 +3,16 @@ from gradio_client import Client, handle_file
import json
import re
import os
-# def extract_invoice_info(markdown_text):
-# try:
-# # 提取发票号码
-# invoice_number = re.search(r'发票号码:\s*(\d+)', markdown_text)
-# if not invoice_number:
-# raise ValueError("无法提取发票号码")
-
-# # 提取销售方名称
-# seller_section = markdown_text.split('销售方信息')[-1]
-# seller_name = re.search(r'名称:\s*(.*?)\n', seller_section)
-# if not seller_name:
-# raise ValueError("无法提取销售方名称")
-
-# # 提取小写金额
-# amount = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
-# if not amount:
-# raise ValueError("无法提取小写金额")
-
-# return {
-# "发票号码": invoice_number.group(1),
-# "销售方名称": seller_name.group(1).strip(),
-# "金额": amount.group(1)
-# }
-# except Exception as e:
-# print(f"提取信息时出错: {e}")
-# return None
+def save_to_json(data, filename):
+ """将数据保存为JSON文件"""
+ os.makedirs(os.path.dirname(filename), exist_ok=True) # 确保目录存在
-markdown_text = """ Page 1 of 1
-| | |
-| --- | --- |
-| [QR Code] | |
-
-**发票信息**
-电子发票(普通发票)
-国家税务总局厦门市税务局
-厦门市税务局
-
-发票号码: 25947000000028179639
-开票日期: 2025年05月20日
-
-**购买方信息**
-名称: 集美大学
-统一社会信用代码/纳税人识别号: 12350000426600329N
-
-**销售方信息**
-名称: 厦门京东东和贸易有限公司
-统一社会信用代码/纳税人识别号: 91350212MA34A9L25L
-
-
-
-
-| 项目名称 |
-规格型号 |
-单位 |
-数量 |
-单价 |
-金额 |
-税率/征收率 |
-税额 |
-
-
-
-
-| *计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线 |
-ROSE-240100C |
-个 |
-2 |
-23.01 |
-46.02 |
-13% |
-5.98 |
-
-
-| *24V2A电源适配器1000mA |
- |
- |
- |
- |
--7.18 |
-13% |
--0.93 |
-
-
-| 合计 |
- |
- |
- |
- |
-¥38.84 |
- |
-¥5.05 |
-
-
-| 价税合计(大写) |
-肆拾叁圆捌角玖分 |
- |
- |
- |
-(小写) ¥43.89 |
- |
- |
-
-
-
-
-备 注 订单号:316584139470
-
-开票人: 王梅"""
-
+ with open(filename, 'w', encoding='utf-8') as f:
+ json.dump(data, f, ensure_ascii=False, indent=4)
def extract_invoice_info(markdown_text):
try:
-
-
-
# 提取发票号码
invoice_match = re.search(r'发票号码:\s*(\d+)', markdown_text)
if not invoice_match:
@@ -138,8 +32,6 @@ def extract_invoice_info(markdown_text):
# amount_match = re.search(r'小写\s*¥(\d+\.\d+)', markdown_text)
amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
-
-
if not amount_match:
raise ValueError("未找到金额信息")
amount = amount_match.group(1)
@@ -151,33 +43,23 @@ def extract_invoice_info(markdown_text):
"total_amount": amount,
"items": []
}
-
# print("amount:", amount)
# 提取商品明细
item_section = markdown_text.split('')
if len(item_section) < 2:
- raise ValueError("未找到商品明细部分")
-
-
+ raise ValueError("未找到商品明细部分")
print("发票号码:", invoice_number)
print("销售方名称:", seller_name)
print("金额:", amount)
-
# 修正表格解析逻辑
table_rows = re.findall(r'.*?
', item_section[1],re.DOTALL)
if len(table_rows) < 3:
raise ValueError("商品明细数据不完整")
-
for row in table_rows[:-2]:
# print("row:", row)
item_name_match = re.search(r'(.*?) | ', row)
model_match = re.search(r']*>.*? | \s*]*>(.*?) | ', row)
quantity_match = re.search(r'(\d+) | ', row)
- # print("item_name_match:", item_name_match)
- # print("model_match:", model_match)
- # print("quantity_match:", quantity_match)
- # if not all([item_name_match, model_match, quantity_match]):
- # raise ValueError("商品信息解析失败")
if item_name_match is not None:
print("项目名称:", item_name_match.group(1))
@@ -202,28 +84,8 @@ def extract_invoice_info(markdown_text):
"model": model,
"quantity": quantity
}
- invoice_data["items"].append(item_data)
-
- # # 解析第一行数据
- # first_row = table_rows[0] # 跳过表头
- # # print("first_row:", first_row)
- # item_name_match = re.search(r'(.*?) | ', first_row)
- # model_match = re.search(r']*>.*? | \s*]*>(.*?) | ', first_row)
- # quantity_match = re.search(r'(\d+) | ', first_row)
- # # print("item_name_match:", item_name_match)
- # # print("model_match:", model_match)
- # # print("quantity_match:", quantity_match)
- # if not all([item_name_match, model_match, quantity_match]):
- # raise ValueError("商品信息解析失败")
-
- # print("发票号码:", invoice_number)
- # print("销售方名称:", seller_name)
- # print("项目名称:", item_name_match.group(1))
- # print("规格型号:", model_match.group(1))
- # print("数量:", quantity_match.group(1))
- # print("金额:", amount)
+ invoice_data["items"].append(item_data)
return invoice_data
-
except Exception as e:
print(f"解析发票信息时出错: {str(e)}")
return None
@@ -254,10 +116,8 @@ def convert_pdf_to_markdown(
images=file_inputs,
api_name="/process_markdown_streaming"
)
-
return result
-
def get_pdf_files(directory):
pdf_files = []
for root, dirs, files in os.walk(directory):
@@ -266,8 +126,6 @@ def get_pdf_files(directory):
pdf_files.append(os.path.join(root, file))
return pdf_files
-
-
if __name__ == "__main__":
# # test extract_invoice_info function
# info = extract_invoice_info(markdown_text)
@@ -275,21 +133,26 @@ if __name__ == "__main__":
# Example usage
# client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
- CLIENT_URL = "https://fceec28e477468b094.gradio.live/"
+ CLIENT_URL = "http://172.29.57.6:9998/"
client = Client(CLIENT_URL, auth=("admin", "admin"))
-
pdf_directory = "pdfs"
-
+ output_dir = "output"
pdf_files = get_pdf_files(pdf_directory)
print(pdf_files)
for pdf_file in pdf_files:
print(f"Found PDF file: {pdf_file}")
-
# Single image conversion
- markdown_content = convert_pdf_to_markdown(
- [pdf_file],client
- )
-
+ markdown_content = convert_pdf_to_markdown([pdf_file],client)
# print(markdown_content)
invoice_info = extract_invoice_info(markdown_content)
print(f"Extracted invoice info: {invoice_info}")
+ if invoice_info:
+ # 生成输出文件名
+ base_name = os.path.splitext(os.path.basename(pdf_file))[0]
+ print(f"Base name: {base_name}")
+ json_file = os.path.join(output_dir, f"{base_name}.json")
+ print(f"JSON file path: {json_file}")
+
+ # 保存为JSON
+ save_to_json(invoice_info, json_file)
+ print(f"发票信息已保存到: {json_file}")
diff --git a/docext-test/pdfs/250407/250224_21.76.pdf b/docext-test/pdfs/250407/250224_21.76.pdf
new file mode 100644
index 0000000..bc00336
Binary files /dev/null and b/docext-test/pdfs/250407/250224_21.76.pdf differ
diff --git a/docext-test/pdfs/250407/250308_53.47.pdf b/docext-test/pdfs/250407/250308_53.47.pdf
new file mode 100644
index 0000000..3d5bf80
Binary files /dev/null and b/docext-test/pdfs/250407/250308_53.47.pdf differ
diff --git a/docext-test/pdfs/250407/250312_21.00.pdf b/docext-test/pdfs/250407/250312_21.00.pdf
new file mode 100644
index 0000000..6cd7099
Binary files /dev/null and b/docext-test/pdfs/250407/250312_21.00.pdf differ
diff --git a/docext-test/pdfs/250407/250313_80.00.pdf b/docext-test/pdfs/250407/250313_80.00.pdf
new file mode 100644
index 0000000..5ba4077
Binary files /dev/null and b/docext-test/pdfs/250407/250313_80.00.pdf differ
diff --git a/docext-test/pdfs/250407/250317_63.50.pdf b/docext-test/pdfs/250407/250317_63.50.pdf
new file mode 100644
index 0000000..0140a33
Binary files /dev/null and b/docext-test/pdfs/250407/250317_63.50.pdf differ
diff --git a/docext-test/pdfs/250407/250318_70.06.pdf b/docext-test/pdfs/250407/250318_70.06.pdf
new file mode 100644
index 0000000..5d02024
Binary files /dev/null and b/docext-test/pdfs/250407/250318_70.06.pdf differ
diff --git a/docext-test/pdfs/250407/250321_26.49.pdf b/docext-test/pdfs/250407/250321_26.49.pdf
new file mode 100644
index 0000000..a3a2a24
Binary files /dev/null and b/docext-test/pdfs/250407/250321_26.49.pdf differ
diff --git a/docext-test/pdfs/250715/250307_78.36.pdf b/docext-test/pdfs/250715/250307_78.36.pdf
new file mode 100644
index 0000000..33f5a59
Binary files /dev/null and b/docext-test/pdfs/250715/250307_78.36.pdf differ
diff --git a/docext-test/pdfs/250715/250308_53.47.pdf b/docext-test/pdfs/250715/250308_53.47.pdf
new file mode 100644
index 0000000..3d5bf80
Binary files /dev/null and b/docext-test/pdfs/250715/250308_53.47.pdf differ
diff --git a/docext-test/pdfs/250715/250309_4.81.pdf b/docext-test/pdfs/250715/250309_4.81.pdf
new file mode 100644
index 0000000..e153545
Binary files /dev/null and b/docext-test/pdfs/250715/250309_4.81.pdf differ
diff --git a/docext-test/pdfs/250715/250312_49.18.pdf b/docext-test/pdfs/250715/250312_49.18.pdf
new file mode 100644
index 0000000..93f3883
Binary files /dev/null and b/docext-test/pdfs/250715/250312_49.18.pdf differ
diff --git a/docext-test/pdfs/250715/250405_5.60.pdf b/docext-test/pdfs/250715/250405_5.60.pdf
new file mode 100644
index 0000000..b994bfb
Binary files /dev/null and b/docext-test/pdfs/250715/250405_5.60.pdf differ
diff --git a/docext-test/pdfs/250715/250516_11.50.pdf b/docext-test/pdfs/250715/250516_11.50.pdf
new file mode 100644
index 0000000..52eb0be
Binary files /dev/null and b/docext-test/pdfs/250715/250516_11.50.pdf differ
diff --git a/docext-test/pdfs/250715/250520_43.89.pdf b/docext-test/pdfs/250715/250520_43.89.pdf
new file mode 100644
index 0000000..ae933e2
Binary files /dev/null and b/docext-test/pdfs/250715/250520_43.89.pdf differ
diff --git a/docext-test/pdfs/250715/250604_970.pdf b/docext-test/pdfs/250715/250604_970.pdf
new file mode 100644
index 0000000..0117552
Binary files /dev/null and b/docext-test/pdfs/250715/250604_970.pdf differ
diff --git a/docext-test/pdfs/250715/250604_970_2.pdf b/docext-test/pdfs/250715/250604_970_2.pdf
new file mode 100644
index 0000000..5e292ba
Binary files /dev/null and b/docext-test/pdfs/250715/250604_970_2.pdf differ