Update markdown text parsing

2025-07-22 12:16:51 +08:00
parent abc6450f20
commit 58f91ae65e
1 changed files with 128 additions and 17 deletions
--- a/docext-test/docext_api_test.py
+++ b/docext-test/docext_api_test.py
@@ -29,6 +29,91 @@ import os
 #     except Exception as e:
 #         print(f"提取信息时出错: {e}")
 #         return None
+
+
+markdown_text = """ Page 1 of 1
+|  |  |
+| --- | --- |
+| [QR Code] |  |
+
+**发票信息**
+电子发票(普通发票)
+<signature>国家税务总局厦门市税务局</signature>
+<signature>厦门市税务局</signature>
+
+发票号码: 25947000000028179639
+开票日期: 2025年05月20日
+
+**购买方信息**
+名称: 集美大学
+统一社会信用代码/纳税人识别号: 12350000426600329N
+
+**销售方信息**
+名称: 厦门京东东和贸易有限公司
+统一社会信用代码/纳税人识别号: 91350212MA34A9L25L
+
+<table>
+<thead>
+<tr>
+<th>项目名称</th>
+<th>规格型号</th>
+<th>单位</th>
+<th>数量</th>
+<th>单价</th>
+<th>金额</th>
+<th>税率/征收率</th>
+<th>税额</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
+<td>ROSE-240100C</td>
+<td>个</td>
+<td>2</td>
+<td>23.01</td>
+<td>46.02</td>
+<td>13%</td>
+<td>5.98</td>
+</tr>
+<tr>
+<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td>-7.18</td>
+<td>13%</td>
+<td>-0.93</td>
+</tr>
+<tr>
+<td>合计</td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td>￥38.84</td>
+<td></td>
+<td>￥5.05</td>
+</tr>
+<tr>
+<td>价税合计(大写)</td>
+<td>肆拾叁圆捌角玖分</td>
+<td></td>
+<td></td>
+<td></td>
+<td>(小写) ￥43.89</td>
+<td></td>
+<td></td>
+</tr>
+</tbody>
+</table>
+
+备 注 订单号:316584139470
+
+开票人: 王梅"""
+
+
 def extract_invoice_info(markdown_text):
    try:
        # 提取发票号码
@@ -36,26 +121,49 @@ def extract_invoice_info(markdown_text):
        if not invoice_match:
            raise ValueError("未找到发票号码信息")
        invoice_number = invoice_match.group(1)
-        
+        print("invoice_number:", invoice_number)
        # 提取销售方名称
        seller_section = markdown_text.split('销售方信息')
        if len(seller_section) < 2:
            raise ValueError("未找到销售方信息部分")
-            
        seller_match = re.search(r'名称:\s*(.*?)\n', seller_section[-1])
        if not seller_match:
            raise ValueError("未找到销售方名称")
        seller_name = seller_match.group(1).strip()
-        
-        # 提取小写金额
+        print("seller_name:", seller_name)
+        # 修正金额正则表达式（移除$符号）
+        # amount_match = re.search(r'小写\s*￥(\d+\.\d+)', markdown_text)
        amount_match = re.search(r'\(小写\)\s*￥(\d+\.\d+)', markdown_text)
+
        if not amount_match:
            raise ValueError("未找到金额信息")
        amount = amount_match.group(1)
+        print("amount:", amount)
+        # 提取商品明细
+        item_section = markdown_text.split('<tbody>')
+        if len(item_section) < 2:
+            raise ValueError("未找到商品明细部分")
+            
+        # 修正表格解析逻辑
+        table_rows = re.findall(r'<tr>.*?</tr>', item_section[1],re.DOTALL)
+        if len(table_rows) < 2:
+            raise ValueError("商品明细数据不完整")
+            
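+        # Parse one detail row. NOTE(review): table_rows comes from the text after <tbody> (the header row is in <thead>), so index 1 is the second data row, not the first - confirm intent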
+        first_row = table_rows[1]  # 跳过表头
+        item_name_match = re.search(r'<td>(.*?)</td>', first_row)
+        model_match = re.search(r'<td>([^<]*)</td>', first_row)
+        quantity_match = re.search(r'<td>(\d+)</td>', first_row)
        
+        if not all([item_name_match, model_match, quantity_match]):
+            raise ValueError("商品信息解析失败")
+            
        return {
            "发票号码": invoice_number,
            "销售方名称": seller_name,
+            "项目名称": item_name_match.group(1).strip(),
+            "规格型号": model_match.group(1).strip(),
+            "数量": quantity_match.group(1),
            "金额": amount
        }
        
@@ -63,6 +171,8 @@ def extract_invoice_info(markdown_text):
        print(f"解析发票信息时出错: {str(e)}")
        return None

+
+
 def convert_pdf_to_markdown(
    file_paths: list[str],
    client    
@@ -104,23 +214,24 @@ def get_pdf_files(directory):


 if __name__ == "__main__":
+    extract_invoice_info(markdown_text)

    # Example usage
    # client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
-    CLIENT_URL = "https://61d79ea57016de2c8d.gradio.live/"
-    client = Client(CLIENT_URL, auth=("admin", "admin"))
+    # CLIENT_URL = "https://fceec28e477468b094.gradio.live/"
+    # client = Client(CLIENT_URL, auth=("admin", "admin"))

-    pdf_directory = "pdfs"
+    # pdf_directory = "pdfs"

-    pdf_files = get_pdf_files(pdf_directory)
-    for pdf_file in pdf_files:
-        print(f"Found PDF file: {pdf_file}")
+    # pdf_files = get_pdf_files(pdf_directory)
+    # for pdf_file in pdf_files:
+    #     print(f"Found PDF file: {pdf_file}")

-        # Single image conversion
-        markdown_content = convert_pdf_to_markdown(
-            [pdf_file],client
-        )
+    #     # Single image conversion
+    #     markdown_content = convert_pdf_to_markdown(
+    #         [pdf_file],client
+    #     )

-        # print(markdown_content)
-        invoice_info = extract_invoice_info(markdown_content)
-        print(f"Extracted invoice info: {invoice_info}")
+    #     # print(markdown_content)
+    #     invoice_info = extract_invoice_info(markdown_content)
+    #     print(f"Extracted invoice info: {invoice_info}")