diff --git a/.gitignore b/.gitignore index 4e8ff1b..270567b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ models/** -# \ No newline at end of file +output/** \ No newline at end of file diff --git a/docext-test/.gitignore b/docext-test/.gitignore index e69de29..9b1960e 100644 --- a/docext-test/.gitignore +++ b/docext-test/.gitignore @@ -0,0 +1 @@ +output/ \ No newline at end of file diff --git a/docext-test/docext_api_test.py b/docext-test/docext_api_test.py index acf9f1c..41d0517 100644 --- a/docext-test/docext_api_test.py +++ b/docext-test/docext_api_test.py @@ -3,122 +3,16 @@ from gradio_client import Client, handle_file import json import re import os -# def extract_invoice_info(markdown_text): -# try: -# # 提取发票号码 -# invoice_number = re.search(r'发票号码:\s*(\d+)', markdown_text) -# if not invoice_number: -# raise ValueError("无法提取发票号码") - -# # 提取销售方名称 -# seller_section = markdown_text.split('销售方信息')[-1] -# seller_name = re.search(r'名称:\s*(.*?)\n', seller_section) -# if not seller_name: -# raise ValueError("无法提取销售方名称") - -# # 提取小写金额 -# amount = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text) -# if not amount: -# raise ValueError("无法提取小写金额") - -# return { -# "发票号码": invoice_number.group(1), -# "销售方名称": seller_name.group(1).strip(), -# "金额": amount.group(1) -# } -# except Exception as e: -# print(f"提取信息时出错: {e}") -# return None +def save_to_json(data, filename): + """将数据保存为JSON文件""" + os.makedirs(os.path.dirname(filename), exist_ok=True) # 确保目录存在 -markdown_text = """ Page 1 of 1 -| | | -| --- | --- | -| [QR Code] | | - -**发票信息** -电子发票(普通发票) -国家税务总局厦门市税务局 -厦门市税务局 - -发票号码: 25947000000028179639 -开票日期: 2025年05月20日 - -**购买方信息** -名称: 集美大学 -统一社会信用代码/纳税人识别号: 12350000426600329N - -**销售方信息** -名称: 厦门京东东和贸易有限公司 -统一社会信用代码/纳税人识别号: 91350212MA34A9L25L - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
项目名称规格型号单位数量单价金额税率/征收率税额
*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线ROSE-240100C223.0146.0213%5.98
*24V2A电源适配器1000mA-7.1813%-0.93
合计¥38.84¥5.05
价税合计(大写)肆拾叁圆捌角玖分(小写) ¥43.89
- -备 注 订单号:316584139470 - -开票人: 王梅""" - + with open(filename, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) def extract_invoice_info(markdown_text): try: - - - # 提取发票号码 invoice_match = re.search(r'发票号码:\s*(\d+)', markdown_text) if not invoice_match: @@ -138,8 +32,6 @@ def extract_invoice_info(markdown_text): # amount_match = re.search(r'小写\s*¥(\d+\.\d+)', markdown_text) amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text) - - if not amount_match: raise ValueError("未找到金额信息") amount = amount_match.group(1) @@ -151,33 +43,23 @@ def extract_invoice_info(markdown_text): "total_amount": amount, "items": [] } - # print("amount:", amount) # 提取商品明细 item_section = markdown_text.split('') if len(item_section) < 2: - raise ValueError("未找到商品明细部分") - - + raise ValueError("未找到商品明细部分") print("发票号码:", invoice_number) print("销售方名称:", seller_name) print("金额:", amount) - # 修正表格解析逻辑 table_rows = re.findall(r'.*?', item_section[1],re.DOTALL) if len(table_rows) < 3: raise ValueError("商品明细数据不完整") - for row in table_rows[:-2]: # print("row:", row) item_name_match = re.search(r'(.*?)', row) model_match = re.search(r']*>.*?\s*]*>(.*?)', row) quantity_match = re.search(r'(\d+)', row) - # print("item_name_match:", item_name_match) - # print("model_match:", model_match) - # print("quantity_match:", quantity_match) - # if not all([item_name_match, model_match, quantity_match]): - # raise ValueError("商品信息解析失败") if item_name_match is not None: print("项目名称:", item_name_match.group(1)) @@ -202,28 +84,8 @@ def extract_invoice_info(markdown_text): "model": model, "quantity": quantity } - invoice_data["items"].append(item_data) - - # # 解析第一行数据 - # first_row = table_rows[0] # 跳过表头 - # # print("first_row:", first_row) - # item_name_match = re.search(r'(.*?)', first_row) - # model_match = re.search(r']*>.*?\s*]*>(.*?)', first_row) - # quantity_match = re.search(r'(\d+)', first_row) - # # print("item_name_match:", item_name_match) - # # print("model_match:", model_match) - # # print("quantity_match:", quantity_match) - # if not all([item_name_match, model_match, quantity_match]): - # raise ValueError("商品信息解析失败") - - # print("发票号码:", invoice_number) - # print("销售方名称:", seller_name) - # print("项目名称:", item_name_match.group(1)) - # print("规格型号:", model_match.group(1)) - # print("数量:", quantity_match.group(1)) - # print("金额:", amount) + invoice_data["items"].append(item_data) return invoice_data - except Exception as e: print(f"解析发票信息时出错: {str(e)}") return None @@ -254,10 +116,8 @@ def convert_pdf_to_markdown( images=file_inputs, api_name="/process_markdown_streaming" ) - return result - def get_pdf_files(directory): pdf_files = [] for root, dirs, files in os.walk(directory): @@ -266,8 +126,6 @@ def get_pdf_files(directory): pdf_files.append(os.path.join(root, file)) return pdf_files - - if __name__ == "__main__": # # test extract_invoice_info function # info = extract_invoice_info(markdown_text) @@ -275,21 +133,26 @@ if __name__ == "__main__": # Example usage # client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/` - CLIENT_URL = "https://fceec28e477468b094.gradio.live/" + CLIENT_URL = "http://172.29.57.6:9998/" client = Client(CLIENT_URL, auth=("admin", "admin")) - pdf_directory = "pdfs" - + output_dir = "output" pdf_files = get_pdf_files(pdf_directory) print(pdf_files) for pdf_file in pdf_files: print(f"Found PDF file: {pdf_file}") - # Single image conversion - markdown_content = convert_pdf_to_markdown( - [pdf_file],client - ) - + markdown_content = convert_pdf_to_markdown([pdf_file],client) # print(markdown_content) invoice_info = extract_invoice_info(markdown_content) print(f"Extracted invoice info: {invoice_info}") + if invoice_info: + # 生成输出文件名 + base_name = os.path.splitext(os.path.basename(pdf_file))[0] + print(f"Base name: {base_name}") + json_file = os.path.join(output_dir, f"{base_name}.json") + print(f"JSON file path: {json_file}") + + # 保存为JSON + save_to_json(invoice_info, json_file) + print(f"发票信息已保存到: {json_file}") diff --git a/docext-test/pdfs/250407/250224_21.76.pdf b/docext-test/pdfs/250407/250224_21.76.pdf new file mode 100644 index 0000000..bc00336 Binary files /dev/null and b/docext-test/pdfs/250407/250224_21.76.pdf differ diff --git a/docext-test/pdfs/250407/250308_53.47.pdf b/docext-test/pdfs/250407/250308_53.47.pdf new file mode 100644 index 0000000..3d5bf80 Binary files /dev/null and b/docext-test/pdfs/250407/250308_53.47.pdf differ diff --git a/docext-test/pdfs/250407/250312_21.00.pdf b/docext-test/pdfs/250407/250312_21.00.pdf new file mode 100644 index 0000000..6cd7099 Binary files /dev/null and b/docext-test/pdfs/250407/250312_21.00.pdf differ diff --git a/docext-test/pdfs/250407/250313_80.00.pdf b/docext-test/pdfs/250407/250313_80.00.pdf new file mode 100644 index 0000000..5ba4077 Binary files /dev/null and b/docext-test/pdfs/250407/250313_80.00.pdf differ diff --git a/docext-test/pdfs/250407/250317_63.50.pdf b/docext-test/pdfs/250407/250317_63.50.pdf new file mode 100644 index 0000000..0140a33 Binary files /dev/null and b/docext-test/pdfs/250407/250317_63.50.pdf differ diff --git a/docext-test/pdfs/250407/250318_70.06.pdf b/docext-test/pdfs/250407/250318_70.06.pdf new file mode 100644 index 0000000..5d02024 Binary files /dev/null and b/docext-test/pdfs/250407/250318_70.06.pdf differ diff --git a/docext-test/pdfs/250407/250321_26.49.pdf b/docext-test/pdfs/250407/250321_26.49.pdf new file mode 100644 index 0000000..a3a2a24 Binary files /dev/null and b/docext-test/pdfs/250407/250321_26.49.pdf differ diff --git a/docext-test/pdfs/250715/250307_78.36.pdf b/docext-test/pdfs/250715/250307_78.36.pdf new file mode 100644 index 0000000..33f5a59 Binary files /dev/null and b/docext-test/pdfs/250715/250307_78.36.pdf differ diff --git a/docext-test/pdfs/250715/250308_53.47.pdf b/docext-test/pdfs/250715/250308_53.47.pdf new file mode 100644 index 0000000..3d5bf80 Binary files /dev/null and b/docext-test/pdfs/250715/250308_53.47.pdf differ diff --git a/docext-test/pdfs/250715/250309_4.81.pdf b/docext-test/pdfs/250715/250309_4.81.pdf new file mode 100644 index 0000000..e153545 Binary files /dev/null and b/docext-test/pdfs/250715/250309_4.81.pdf differ diff --git a/docext-test/pdfs/250715/250312_49.18.pdf b/docext-test/pdfs/250715/250312_49.18.pdf new file mode 100644 index 0000000..93f3883 Binary files /dev/null and b/docext-test/pdfs/250715/250312_49.18.pdf differ diff --git a/docext-test/pdfs/250715/250405_5.60.pdf b/docext-test/pdfs/250715/250405_5.60.pdf new file mode 100644 index 0000000..b994bfb Binary files /dev/null and b/docext-test/pdfs/250715/250405_5.60.pdf differ diff --git a/docext-test/pdfs/250715/250516_11.50.pdf b/docext-test/pdfs/250715/250516_11.50.pdf new file mode 100644 index 0000000..52eb0be Binary files /dev/null and b/docext-test/pdfs/250715/250516_11.50.pdf differ diff --git a/docext-test/pdfs/250715/250520_43.89.pdf b/docext-test/pdfs/250715/250520_43.89.pdf new file mode 100644 index 0000000..ae933e2 Binary files /dev/null and b/docext-test/pdfs/250715/250520_43.89.pdf differ diff --git a/docext-test/pdfs/250715/250604_970.pdf b/docext-test/pdfs/250715/250604_970.pdf new file mode 100644 index 0000000..0117552 Binary files /dev/null and b/docext-test/pdfs/250715/250604_970.pdf differ diff --git a/docext-test/pdfs/250715/250604_970_2.pdf b/docext-test/pdfs/250715/250604_970_2.pdf new file mode 100644 index 0000000..5e292ba Binary files /dev/null and b/docext-test/pdfs/250715/250604_970_2.pdf differ