update parsing markdown
This commit is contained in:
@@ -77,7 +77,7 @@ markdown_text = """ Page 1 of 1
|
|||||||
<td>5.98</td>
|
<td>5.98</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
|
<td>*24V2A电源适配器1000mA</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
<td></td>
|
<td></td>
|
||||||
<td></td>
|
<td></td>
|
||||||
@@ -116,12 +116,15 @@ markdown_text = """ Page 1 of 1
|
|||||||
|
|
||||||
def extract_invoice_info(markdown_text):
|
def extract_invoice_info(markdown_text):
|
||||||
try:
|
try:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 提取发票号码
|
# 提取发票号码
|
||||||
invoice_match = re.search(r'发票号码:\s*(\d+)', markdown_text)
|
invoice_match = re.search(r'发票号码:\s*(\d+)', markdown_text)
|
||||||
if not invoice_match:
|
if not invoice_match:
|
||||||
raise ValueError("未找到发票号码信息")
|
raise ValueError("未找到发票号码信息")
|
||||||
invoice_number = invoice_match.group(1)
|
invoice_number = invoice_match.group(1)
|
||||||
print("invoice_number:", invoice_number)
|
# print("invoice_number:", invoice_number)
|
||||||
# 提取销售方名称
|
# 提取销售方名称
|
||||||
seller_section = markdown_text.split('销售方信息')
|
seller_section = markdown_text.split('销售方信息')
|
||||||
if len(seller_section) < 2:
|
if len(seller_section) < 2:
|
||||||
@@ -130,49 +133,101 @@ def extract_invoice_info(markdown_text):
|
|||||||
if not seller_match:
|
if not seller_match:
|
||||||
raise ValueError("未找到销售方名称")
|
raise ValueError("未找到销售方名称")
|
||||||
seller_name = seller_match.group(1).strip()
|
seller_name = seller_match.group(1).strip()
|
||||||
print("seller_name:", seller_name)
|
# print("seller_name:", seller_name)
|
||||||
# 修正金额正则表达式(移除$符号)
|
# 修正金额正则表达式(移除$符号)
|
||||||
# amount_match = re.search(r'小写\s*¥(\d+\.\d+)', markdown_text)
|
# amount_match = re.search(r'小写\s*¥(\d+\.\d+)', markdown_text)
|
||||||
amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
|
amount_match = re.search(r'\(小写\)\s*¥(\d+\.\d+)', markdown_text)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if not amount_match:
|
if not amount_match:
|
||||||
raise ValueError("未找到金额信息")
|
raise ValueError("未找到金额信息")
|
||||||
amount = amount_match.group(1)
|
amount = amount_match.group(1)
|
||||||
print("amount:", amount)
|
|
||||||
|
# 构建基础数据
|
||||||
|
invoice_data = {
|
||||||
|
"invoice_number": invoice_number,
|
||||||
|
"seller_name": seller_name,
|
||||||
|
"total_amount": amount,
|
||||||
|
"items": []
|
||||||
|
}
|
||||||
|
|
||||||
|
# print("amount:", amount)
|
||||||
# 提取商品明细
|
# 提取商品明细
|
||||||
item_section = markdown_text.split('<tbody>')
|
item_section = markdown_text.split('<tbody>')
|
||||||
if len(item_section) < 2:
|
if len(item_section) < 2:
|
||||||
raise ValueError("未找到商品明细部分")
|
raise ValueError("未找到商品明细部分")
|
||||||
|
|
||||||
|
|
||||||
|
print("发票号码:", invoice_number)
|
||||||
|
print("销售方名称:", seller_name)
|
||||||
|
print("金额:", amount)
|
||||||
|
|
||||||
# 修正表格解析逻辑
|
# 修正表格解析逻辑
|
||||||
table_rows = re.findall(r'<tr>.*?</tr>', item_section[1],re.DOTALL)
|
table_rows = re.findall(r'<tr>.*?</tr>', item_section[1],re.DOTALL)
|
||||||
if len(table_rows) < 2:
|
if len(table_rows) < 3:
|
||||||
raise ValueError("商品明细数据不完整")
|
raise ValueError("商品明细数据不完整")
|
||||||
|
|
||||||
|
for row in table_rows[:-2]:
|
||||||
|
# print("row:", row)
|
||||||
|
item_name_match = re.search(r'<td>(.*?)</td>', row)
|
||||||
|
model_match = re.search(r'<td[^>]*>.*?</td>\s*<td[^>]*>(.*?)</td>', row)
|
||||||
|
quantity_match = re.search(r'<td>(\d+)</td>', row)
|
||||||
|
# print("item_name_match:", item_name_match)
|
||||||
|
# print("model_match:", model_match)
|
||||||
|
# print("quantity_match:", quantity_match)
|
||||||
|
# if not all([item_name_match, model_match, quantity_match]):
|
||||||
|
# raise ValueError("商品信息解析失败")
|
||||||
|
|
||||||
|
if item_name_match is not None:
|
||||||
|
print("项目名称:", item_name_match.group(1))
|
||||||
|
item_name = item_name_match.group(1)
|
||||||
|
else:
|
||||||
|
print("项目名称:", "无")
|
||||||
|
item_name = "无"
|
||||||
|
if model_match is not None:
|
||||||
|
print("规格型号:", model_match.group(1))
|
||||||
|
model = model_match.group(1)
|
||||||
|
else:
|
||||||
|
print("规格型号:", "无")
|
||||||
|
model = "无"
|
||||||
|
if quantity_match is not None:
|
||||||
|
print("数量:", quantity_match.group(1))
|
||||||
|
quantity = quantity_match.group(1)
|
||||||
|
else:
|
||||||
|
print("数量:", "无")
|
||||||
|
quantity = "无"
|
||||||
|
item_data = {
|
||||||
|
"name": item_name,
|
||||||
|
"model": model,
|
||||||
|
"quantity": quantity
|
||||||
|
}
|
||||||
|
invoice_data["items"].append(item_data)
|
||||||
|
|
||||||
# 解析第一行数据
|
# # 解析第一行数据
|
||||||
first_row = table_rows[1] # 跳过表头
|
# first_row = table_rows[0] # 跳过表头
|
||||||
item_name_match = re.search(r'<td>(.*?)</td>', first_row)
|
# # print("first_row:", first_row)
|
||||||
model_match = re.search(r'<td>([^<]*)</td>', first_row)
|
# item_name_match = re.search(r'<td>(.*?)</td>', first_row)
|
||||||
quantity_match = re.search(r'<td>(\d+)</td>', first_row)
|
# model_match = re.search(r'<td[^>]*>.*?</td>\s*<td[^>]*>(.*?)</td>', first_row)
|
||||||
|
# quantity_match = re.search(r'<td>(\d+)</td>', first_row)
|
||||||
if not all([item_name_match, model_match, quantity_match]):
|
# # print("item_name_match:", item_name_match)
|
||||||
raise ValueError("商品信息解析失败")
|
# # print("model_match:", model_match)
|
||||||
|
# # print("quantity_match:", quantity_match)
|
||||||
return {
|
# if not all([item_name_match, model_match, quantity_match]):
|
||||||
"发票号码": invoice_number,
|
# raise ValueError("商品信息解析失败")
|
||||||
"销售方名称": seller_name,
|
|
||||||
"项目名称": item_name_match.group(1).strip(),
|
# print("发票号码:", invoice_number)
|
||||||
"规格型号": model_match.group(1).strip(),
|
# print("销售方名称:", seller_name)
|
||||||
"数量": quantity_match.group(1),
|
# print("项目名称:", item_name_match.group(1))
|
||||||
"金额": amount
|
# print("规格型号:", model_match.group(1))
|
||||||
}
|
# print("数量:", quantity_match.group(1))
|
||||||
|
# print("金额:", amount)
|
||||||
|
return invoice_data
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"解析发票信息时出错: {str(e)}")
|
print(f"解析发票信息时出错: {str(e)}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def convert_pdf_to_markdown(
|
def convert_pdf_to_markdown(
|
||||||
file_paths: list[str],
|
file_paths: list[str],
|
||||||
client
|
client
|
||||||
@@ -214,24 +269,27 @@ def get_pdf_files(directory):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
extract_invoice_info(markdown_text)
|
# # test extract_invoice_info function
|
||||||
|
# info = extract_invoice_info(markdown_text)
|
||||||
|
# print("Extracted invoice info:", info)
|
||||||
|
|
||||||
# Example usage
|
# Example usage
|
||||||
# client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
|
# client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
|
||||||
# CLIENT_URL = "https://fceec28e477468b094.gradio.live/"
|
CLIENT_URL = "https://fceec28e477468b094.gradio.live/"
|
||||||
# client = Client(CLIENT_URL, auth=("admin", "admin"))
|
client = Client(CLIENT_URL, auth=("admin", "admin"))
|
||||||
|
|
||||||
# pdf_directory = "pdfs"
|
pdf_directory = "pdfs"
|
||||||
|
|
||||||
# pdf_files = get_pdf_files(pdf_directory)
|
pdf_files = get_pdf_files(pdf_directory)
|
||||||
# for pdf_file in pdf_files:
|
print(pdf_files)
|
||||||
# print(f"Found PDF file: {pdf_file}")
|
for pdf_file in pdf_files:
|
||||||
|
print(f"Found PDF file: {pdf_file}")
|
||||||
|
|
||||||
# # Single image conversion
|
# Single image conversion
|
||||||
# markdown_content = convert_pdf_to_markdown(
|
markdown_content = convert_pdf_to_markdown(
|
||||||
# [pdf_file],client
|
[pdf_file],client
|
||||||
# )
|
)
|
||||||
|
|
||||||
# # print(markdown_content)
|
# print(markdown_content)
|
||||||
# invoice_info = extract_invoice_info(markdown_content)
|
invoice_info = extract_invoice_info(markdown_content)
|
||||||
# print(f"Extracted invoice info: {invoice_info}")
|
print(f"Extracted invoice info: {invoice_info}")
|
||||||
|
|||||||
81
mk.md
Normal file
81
mk.md
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
Page 1 of 1
|
||||||
|
| | |
|
||||||
|
| --- | --- |
|
||||||
|
| [QR Code] | |
|
||||||
|
|
||||||
|
**发票信息**
|
||||||
|
电子发票(普通发票)
|
||||||
|
<signature>国家税务总局厦门市税务局</signature>
|
||||||
|
<signature>厦门市税务局</signature>
|
||||||
|
|
||||||
|
发票号码: 25947000000028179639
|
||||||
|
开票日期: 2025年05月20日
|
||||||
|
|
||||||
|
**购买方信息**
|
||||||
|
名称: 集美大学
|
||||||
|
统一社会信用代码/纳税人识别号: 12350000426600329N
|
||||||
|
|
||||||
|
**销售方信息**
|
||||||
|
名称: 厦门京东东和贸易有限公司
|
||||||
|
统一社会信用代码/纳税人识别号: 91350212MA34A9L25L
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>项目名称</th>
|
||||||
|
<th>规格型号</th>
|
||||||
|
<th>单位</th>
|
||||||
|
<th>数量</th>
|
||||||
|
<th>单价</th>
|
||||||
|
<th>金额</th>
|
||||||
|
<th>税率/征收率</th>
|
||||||
|
<th>税额</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
|
||||||
|
<td>ROSE-240100C</td>
|
||||||
|
<td>个</td>
|
||||||
|
<td>2</td>
|
||||||
|
<td>23.01</td>
|
||||||
|
<td>46.02</td>
|
||||||
|
<td>13%</td>
|
||||||
|
<td>5.98</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>*计算机配套产品*金骏芦宝 24V1A电源适配器1000mA适用于按摩器甩脂机瘦身腰带LED台灯吸尘器扫地机器人加湿器充电器电源线</td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td>-7.18</td>
|
||||||
|
<td>13%</td>
|
||||||
|
<td>-0.93</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>合计</td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td>¥38.84</td>
|
||||||
|
<td></td>
|
||||||
|
<td>¥5.05</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>价税合计(大写)</td>
|
||||||
|
<td>肆拾叁圆捌角玖分</td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td>(小写) ¥43.89</td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
备 注 订单号:316584139470
|
||||||
|
|
||||||
|
开票人: 王梅
|
||||||
Reference in New Issue
Block a user