Add new PDFs; tests partly passed

2025-07-23 17:53:18 +08:00
parent d2f2240a6c
commit 05cd6ae7b9
19 changed files with 22 additions and 158 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
 models/**
-#
+output/**
--- a/docext-test/.gitignore
+++ b/docext-test/.gitignore
@@ -0,0 +1 @@
+output/
--- a/docext-test/docext_api_test.py
+++ b/docext-test/docext_api_test.py
@@ -3,122 +3,16 @@ from gradio_client import Client, handle_file
 import json
 import re
 import os
-# def extract_invoice_info(markdown_text):
-#     try:
-#         # 提取发票号码
-#         invoice_number = re.search(r'发票号码:\s*(\d+)', markdown_text)
-#         if not invoice_number:
-#             raise ValueError("无法提取发票号码")

-#         # 提取销售方名称
-#         seller_section = markdown_text.split('销售方信息')[-1]
-#         seller_name = re.search(r'名称:\s*(.*?)\n', seller_section)
-#         if not seller_name:
-#             raise ValueError("无法提取销售方名称")
-            
-#         # 提取小写金额
-#         amount = re.search(r'\(小写\)\s*￥(\d+\.\d+)', markdown_text)
-#         if not amount:
-#             raise ValueError("无法提取小写金额")
-            
-#         return {
-#             "发票号码": invoice_number.group(1),
-#             "销售方名称": seller_name.group(1).strip(),
-#             "金额": amount.group(1)
-#         }
-#     except Exception as e:
-#         print(f"提取信息时出错: {e}")
-#         return None
-
-
-markdown_text = """ Page 1 of 1
-|  |  |
-| --- | --- |
-| [QR Code] |  |
-
-**发票信息**
-电子发票(普通发票)
-<signature>国家税务总局厦门市税务局</signature>
-<signature>厦门市税务局</signature>
-
-发票号码: 25947000000028179639
-开票日期: 2025年05月20日
-
-**购买方信息**
-名称: 集美大学
-统一社会信用代码/纳税人识别号: 12350000426600329N
-
-**销售方信息**
-名称: 厦门京东东和贸易有限公司
-统一社会信用代码/纳税人识别号: 91350212MA34A9L25L
-
-<table>
-<thead>
-<tr>
-<th>项目名称</th>
-<th>规格型号</th>
-<th>单位</th>
-<th>数量</th>
-<th>单价</th>
-<th>金额</th>
-<th>税率/征收率</th>
-<th>税额</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
-<td>ROSE-240100C</td>
-<td>个</td>
-<td>2</td>
-<td>23.01</td>
-<td>46.02</td>
-<td>13%</td>
-<td>5.98</td>
-</tr>
-<tr>
-<td>*24V2A电源适配器1000mA</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td>-7.18</td>
-<td>13%</td>
-<td>-0.93</td>
-</tr>
-<tr>
-<td>合计</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td>￥38.84</td>
-<td></td>
-<td>￥5.05</td>
-</tr>
-<tr>
-<td>价税合计(大写)</td>
-<td>肆拾叁圆捌角玖分</td>
-<td></td>
-<td></td>
-<td></td>
-<td>(小写) ￥43.89</td>
-<td></td>
-<td></td>
-</tr>
-</tbody>
-</table>
-
-备 注 订单号:316584139470
-
-开票人: 王梅"""
+def save_to_json(data, filename):
+    """将数据保存为JSON文件"""
+    os.makedirs(os.path.dirname(filename), exist_ok=True)  # 确保目录存在

+    with open(filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)

 def extract_invoice_info(markdown_text):
    try:
-
-
-
        # 提取发票号码
        invoice_match = re.search(r'发票号码:\s*(\d+)', markdown_text)
        if not invoice_match:
@@ -138,8 +32,6 @@ def extract_invoice_info(markdown_text):
        # amount_match = re.search(r'小写\s*￥(\d+\.\d+)', markdown_text)
        amount_match = re.search(r'\(小写\)\s*￥(\d+\.\d+)', markdown_text)

-
-
        if not amount_match:
            raise ValueError("未找到金额信息")
        amount = amount_match.group(1)
@@ -151,33 +43,23 @@ def extract_invoice_info(markdown_text):
            "total_amount": amount,
            "items": []
        }
-
        # print("amount:", amount)
        # 提取商品明细
        item_section = markdown_text.split('<tbody>')
        if len(item_section) < 2:
            raise ValueError("未找到商品明细部分")          
-
-          
        print("发票号码:", invoice_number)
        print("销售方名称:", seller_name)  
        print("金额:", amount)
-
        # 修正表格解析逻辑
        table_rows = re.findall(r'<tr>.*?</tr>', item_section[1],re.DOTALL)
        if len(table_rows) < 3:
            raise ValueError("商品明细数据不完整")
-
        for row in table_rows[:-2]:
            # print("row:", row)
            item_name_match = re.search(r'<td>(.*?)</td>', row)
            model_match = re.search(r'<td[^>]*>.*?</td>\s*<td[^>]*>(.*?)</td>', row)
            quantity_match = re.search(r'<td>(\d+)</td>', row)
-            # print("item_name_match:", item_name_match)
-            # print("model_match:", model_match)
-            # print("quantity_match:", quantity_match)
-            # if not all([item_name_match, model_match, quantity_match]):
-            #     raise ValueError("商品信息解析失败")

            if item_name_match is not None:
                print("项目名称:", item_name_match.group(1))
@@ -203,27 +85,7 @@ def extract_invoice_info(markdown_text):
                "quantity": quantity 
            }
            invoice_data["items"].append(item_data) 
-            
-        # # 解析第一行数据
-        # first_row = table_rows[0]  # 跳过表头
-        # # print("first_row:", first_row)
-        # item_name_match = re.search(r'<td>(.*?)</td>', first_row)
-        # model_match = re.search(r'<td[^>]*>.*?</td>\s*<td[^>]*>(.*?)</td>', first_row)
-        # quantity_match = re.search(r'<td>(\d+)</td>', first_row)
-        # # print("item_name_match:", item_name_match)
-        # # print("model_match:", model_match)
-        # # print("quantity_match:", quantity_match)
-        # if not all([item_name_match, model_match, quantity_match]):
-        #     raise ValueError("商品信息解析失败")
-
-        # print("发票号码:", invoice_number)
-        # print("销售方名称:", seller_name)
-        # print("项目名称:", item_name_match.group(1))
-        # print("规格型号:", model_match.group(1))
-        # print("数量:", quantity_match.group(1))
-        # print("金额:", amount)    
        return invoice_data
-
    except Exception as e:
        print(f"解析发票信息时出错: {str(e)}")
        return None
@@ -254,10 +116,8 @@ def convert_pdf_to_markdown(
        images=file_inputs,
        api_name="/process_markdown_streaming"
    )
-
    return result

-
 def get_pdf_files(directory):
    pdf_files = []
    for root, dirs, files in os.walk(directory):
@@ -266,8 +126,6 @@ def get_pdf_files(directory):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

-
-
 if __name__ == "__main__":
    # # test extract_invoice_info function
    # info = extract_invoice_info(markdown_text)
@@ -275,21 +133,26 @@ if __name__ == "__main__":

    # Example usage
    # client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
-    CLIENT_URL = "https://fceec28e477468b094.gradio.live/"
+    CLIENT_URL = "http://172.29.57.6:9998/"
    client = Client(CLIENT_URL, auth=("admin", "admin"))
-
    pdf_directory = "pdfs"
-
+    output_dir = "output"
    pdf_files = get_pdf_files(pdf_directory)
    print(pdf_files)
    for pdf_file in pdf_files:
        print(f"Found PDF file: {pdf_file}")
-
        # Single image conversion
-        markdown_content = convert_pdf_to_markdown(
-            [pdf_file],client
-        )
-
+        markdown_content = convert_pdf_to_markdown([pdf_file],client)
        # print(markdown_content)
        invoice_info = extract_invoice_info(markdown_content)
        print(f"Extracted invoice info: {invoice_info}")
+        if invoice_info:
+            # 生成输出文件名
+            base_name = os.path.splitext(os.path.basename(pdf_file))[0]
+            print(f"Base name: {base_name}")
+            json_file = os.path.join(output_dir, f"{base_name}.json")
+            print(f"JSON file path: {json_file}")
+
+            # 保存为JSON
+            save_to_json(invoice_info, json_file)
+            print(f"发票信息已保存到: {json_file}")
--- a/docext-test/pdfs/250407/250224_21.76.pdf
+++ b/docext-test/pdfs/250407/250224_21.76.pdf
--- a/docext-test/pdfs/250407/250308_53.47.pdf
+++ b/docext-test/pdfs/250407/250308_53.47.pdf
--- a/docext-test/pdfs/250407/250312_21.00.pdf
+++ b/docext-test/pdfs/250407/250312_21.00.pdf
--- a/docext-test/pdfs/250407/250313_80.00.pdf
+++ b/docext-test/pdfs/250407/250313_80.00.pdf
--- a/docext-test/pdfs/250407/250317_63.50.pdf
+++ b/docext-test/pdfs/250407/250317_63.50.pdf
--- a/docext-test/pdfs/250407/250318_70.06.pdf
+++ b/docext-test/pdfs/250407/250318_70.06.pdf
--- a/docext-test/pdfs/250407/250321_26.49.pdf
+++ b/docext-test/pdfs/250407/250321_26.49.pdf
--- a/docext-test/pdfs/250715/250307_78.36.pdf
+++ b/docext-test/pdfs/250715/250307_78.36.pdf
--- a/docext-test/pdfs/250715/250308_53.47.pdf
+++ b/docext-test/pdfs/250715/250308_53.47.pdf
--- a/docext-test/pdfs/250715/250309_4.81.pdf
+++ b/docext-test/pdfs/250715/250309_4.81.pdf
--- a/docext-test/pdfs/250715/250312_49.18.pdf
+++ b/docext-test/pdfs/250715/250312_49.18.pdf
--- a/docext-test/pdfs/250715/250405_5.60.pdf
+++ b/docext-test/pdfs/250715/250405_5.60.pdf
--- a/docext-test/pdfs/250715/250516_11.50.pdf
+++ b/docext-test/pdfs/250715/250516_11.50.pdf
--- a/docext-test/pdfs/250715/250520_43.89.pdf
+++ b/docext-test/pdfs/250715/250520_43.89.pdf
--- a/docext-test/pdfs/250715/250604_970.pdf
+++ b/docext-test/pdfs/250715/250604_970.pdf
--- a/docext-test/pdfs/250715/250604_970_2.pdf
+++ b/docext-test/pdfs/250715/250604_970_2.pdf