diff --git a/docext-test/docext_api_test.py b/docext-test/docext_api_test.py index 29238ff..8ff094d 100644 --- a/docext-test/docext_api_test.py +++ b/docext-test/docext_api_test.py @@ -29,6 +29,91 @@ import os # except Exception as e: # print(f"提取信息时出错: {e}") # return None + + +markdown_text = """ Page 1 of 1 +| | | +| --- | --- | +| [QR Code] | | + +**发票信息** +电子发票(普通发票) +国家税务总局厦门市税务局 +厦门市税务局 + +发票号码: 25947000000028179639 +开票日期: 2025年05月20日 + +**购买方信息** +名称: 集美大学 +统一社会信用代码/纳税人识别号: 12350000426600329N + +**销售方信息** +名称: 厦门京东东和贸易有限公司 +统一社会信用代码/纳税人识别号: 91350212MA34A9L25L + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
项目名称规格型号单位数量单价金额税率/征收率税额
*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线ROSE-240100C223.0146.0213%5.98
*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线-7.1813%-0.93
合计¥38.84¥5.05
价税合计(大写)肆拾叁圆捌角玖分(小写) ¥43.89
+ +备 注 订单号:316584139470 + +开票人: 王梅""" + + def extract_invoice_info(markdown_text): try: # 提取发票号码 @@ -36,26 +121,49 @@ def extract_invoice_info(markdown_text): if not invoice_match: raise ValueError("未找到发票号码信息") invoice_number = invoice_match.group(1) - + print("invoice_number:", invoice_number) # 提取销售方名称 seller_section = markdown_text.split('销售方信息') if len(seller_section) < 2: raise ValueError("未找到销售方信息部分") - seller_match = re.search(r'名称:\s*(.*?)\n', seller_section[-1]) if not seller_match: raise ValueError("未找到销售方名称") seller_name = seller_match.group(1).strip() - - # 提取小写金额 + print("seller_name:", seller_name) + # 修正金额正则表达式(移除$符号) + # amount_match = re.search(r'小写\s*¥(\d+\.\d+)', markdown_text) amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text) + if not amount_match: raise ValueError("未找到金额信息") amount = amount_match.group(1) + print("amount:", amount) + # 提取商品明细 + item_section = markdown_text.split('') + if len(item_section) < 2: + raise ValueError("未找到商品明细部分") + + # 修正表格解析逻辑 + table_rows = re.findall(r'.*?', item_section[1],re.DOTALL) + if len(table_rows) < 2: + raise ValueError("商品明细数据不完整") + + # 解析第一行数据 + first_row = table_rows[1] # 跳过表头 + item_name_match = re.search(r'(.*?)', first_row) + model_match = re.search(r'([^<]*)', first_row) + quantity_match = re.search(r'(\d+)', first_row) + if not all([item_name_match, model_match, quantity_match]): + raise ValueError("商品信息解析失败") + return { "发票号码": invoice_number, "销售方名称": seller_name, + "项目名称": item_name_match.group(1).strip(), + "规格型号": model_match.group(1).strip(), + "数量": quantity_match.group(1), "金额": amount } @@ -63,6 +171,8 @@ def extract_invoice_info(markdown_text): print(f"解析发票信息时出错: {str(e)}") return None + + def convert_pdf_to_markdown( file_paths: list[str], client @@ -104,23 +214,24 @@ def get_pdf_files(directory): if __name__ == "__main__": + extract_invoice_info(markdown_text) # Example usage # client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/` - CLIENT_URL = "https://61d79ea57016de2c8d.gradio.live/" - client = Client(CLIENT_URL, auth=("admin", "admin")) + # CLIENT_URL = "https://fceec28e477468b094.gradio.live/" + # client = Client(CLIENT_URL, auth=("admin", "admin")) - pdf_directory = "pdfs" + # pdf_directory = "pdfs" - pdf_files = get_pdf_files(pdf_directory) - for pdf_file in pdf_files: - print(f"Found PDF file: {pdf_file}") + # pdf_files = get_pdf_files(pdf_directory) + # for pdf_file in pdf_files: + # print(f"Found PDF file: {pdf_file}") - # Single image conversion - markdown_content = convert_pdf_to_markdown( - [pdf_file],client - ) + # # Single image conversion + # markdown_content = convert_pdf_to_markdown( + # [pdf_file],client + # ) - # print(markdown_content) - invoice_info = extract_invoice_info(markdown_content) - print(f"Extracted invoice info: {invoice_info}") + # # print(markdown_content) + # invoice_info = extract_invoice_info(markdown_content) + # print(f"Extracted invoice info: {invoice_info}")