Update Markdown text parsing

This commit is contained in:
2025-07-22 12:16:51 +08:00
parent abc6450f20
commit 58f91ae65e

View File

@@ -29,6 +29,91 @@ import os
# except Exception as e: # except Exception as e:
# print(f"提取信息时出错: {e}") # print(f"提取信息时出错: {e}")
# return None # return None
markdown_text = """ Page 1 of 1
| | |
| --- | --- |
| [QR Code] | |
**发票信息**
电子发票(普通发票)
<signature>国家税务总局厦门市税务局</signature>
<signature>厦门市税务局</signature>
发票号码: 25947000000028179639
开票日期: 2025年05月20日
**购买方信息**
名称: 集美大学
统一社会信用代码/纳税人识别号: 12350000426600329N
**销售方信息**
名称: 厦门京东东和贸易有限公司
统一社会信用代码/纳税人识别号: 91350212MA34A9L25L
<table>
<thead>
<tr>
<th>项目名称</th>
<th>规格型号</th>
<th>单位</th>
<th>数量</th>
<th>单价</th>
<th>金额</th>
<th>税率/征收率</th>
<th>税额</th>
</tr>
</thead>
<tbody>
<tr>
<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
<td>ROSE-240100C</td>
<td>个</td>
<td>2</td>
<td>23.01</td>
<td>46.02</td>
<td>13%</td>
<td>5.98</td>
</tr>
<tr>
<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td>-7.18</td>
<td>13%</td>
<td>-0.93</td>
</tr>
<tr>
<td>合计</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td>¥38.84</td>
<td></td>
<td>¥5.05</td>
</tr>
<tr>
<td>价税合计(大写)</td>
<td>肆拾叁圆捌角玖分</td>
<td></td>
<td></td>
<td></td>
<td>(小写) ¥43.89</td>
<td></td>
<td></td>
</tr>
</tbody>
</table>
备 注 订单号:316584139470
开票人: 王梅"""
def extract_invoice_info(markdown_text): def extract_invoice_info(markdown_text):
try: try:
# 提取发票号码 # 提取发票号码
@@ -36,26 +121,49 @@ def extract_invoice_info(markdown_text):
if not invoice_match: if not invoice_match:
raise ValueError("未找到发票号码信息") raise ValueError("未找到发票号码信息")
invoice_number = invoice_match.group(1) invoice_number = invoice_match.group(1)
print("invoice_number:", invoice_number)
# 提取销售方名称 # 提取销售方名称
seller_section = markdown_text.split('销售方信息') seller_section = markdown_text.split('销售方信息')
if len(seller_section) < 2: if len(seller_section) < 2:
raise ValueError("未找到销售方信息部分") raise ValueError("未找到销售方信息部分")
seller_match = re.search(r'名称:\s*(.*?)\n', seller_section[-1]) seller_match = re.search(r'名称:\s*(.*?)\n', seller_section[-1])
if not seller_match: if not seller_match:
raise ValueError("未找到销售方名称") raise ValueError("未找到销售方名称")
seller_name = seller_match.group(1).strip() seller_name = seller_match.group(1).strip()
print("seller_name:", seller_name)
# 提取小写金额 # 修正金额正则表达式(移除$符号)
# amount_match = re.search(r'小写\s*¥(\d+\.\d+)', markdown_text)
amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text) amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
if not amount_match: if not amount_match:
raise ValueError("未找到金额信息") raise ValueError("未找到金额信息")
amount = amount_match.group(1) amount = amount_match.group(1)
print("amount:", amount)
# 提取商品明细
item_section = markdown_text.split('<tbody>')
if len(item_section) < 2:
raise ValueError("未找到商品明细部分")
# 修正表格解析逻辑
table_rows = re.findall(r'<tr>.*?</tr>', item_section[1],re.DOTALL)
if len(table_rows) < 2:
raise ValueError("商品明细数据不完整")
# 解析第一行数据
first_row = table_rows[1] # 跳过表头
item_name_match = re.search(r'<td>(.*?)</td>', first_row)
model_match = re.search(r'<td>([^<]*)</td>', first_row)
quantity_match = re.search(r'<td>(\d+)</td>', first_row)
if not all([item_name_match, model_match, quantity_match]):
raise ValueError("商品信息解析失败")
return { return {
"发票号码": invoice_number, "发票号码": invoice_number,
"销售方名称": seller_name, "销售方名称": seller_name,
"项目名称": item_name_match.group(1).strip(),
"规格型号": model_match.group(1).strip(),
"数量": quantity_match.group(1),
"金额": amount "金额": amount
} }
@@ -63,6 +171,8 @@ def extract_invoice_info(markdown_text):
print(f"解析发票信息时出错: {str(e)}") print(f"解析发票信息时出错: {str(e)}")
return None return None
def convert_pdf_to_markdown( def convert_pdf_to_markdown(
file_paths: list[str], file_paths: list[str],
client client
@@ -104,23 +214,24 @@ def get_pdf_files(directory):
if __name__ == "__main__": if __name__ == "__main__":
extract_invoice_info(markdown_text)
# Example usage # Example usage
# client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/` # client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
CLIENT_URL = "https://61d79ea57016de2c8d.gradio.live/" # CLIENT_URL = "https://fceec28e477468b094.gradio.live/"
client = Client(CLIENT_URL, auth=("admin", "admin")) # client = Client(CLIENT_URL, auth=("admin", "admin"))
pdf_directory = "pdfs" # pdf_directory = "pdfs"
pdf_files = get_pdf_files(pdf_directory) # pdf_files = get_pdf_files(pdf_directory)
for pdf_file in pdf_files: # for pdf_file in pdf_files:
print(f"Found PDF file: {pdf_file}") # print(f"Found PDF file: {pdf_file}")
# Single image conversion # # Single image conversion
markdown_content = convert_pdf_to_markdown( # markdown_content = convert_pdf_to_markdown(
[pdf_file],client # [pdf_file],client
) # )
# print(markdown_content) # # print(markdown_content)
invoice_info = extract_invoice_info(markdown_content) # invoice_info = extract_invoice_info(markdown_content)
print(f"Extracted invoice info: {invoice_info}") # print(f"Extracted invoice info: {invoice_info}")