当前位置：首页 > news >正文

PDF论文文字公式提取,翻译与对照代码(自用)

news 2025/9/25 14:05:58

代码1:

import re

# Marker comments that delimit a region which must be copied through verbatim.
_PROTECT_START = "<!--DoNotTouchBelow-->"
_PROTECT_END = "<!--DoNotTouchAbove-->"

# An equation whose body is shorter than this (and has no '=') stays inline.
_SHORT_EQUATION_MAX_LEN = 30

# One math span: either $$...$$ or $...$ with no '$' inside.
_MATH_SPAN = re.compile(r'\$\$[^$]+\$\$|\$[^$]+\$')
# Same pattern wrapped in a capture group so re.split keeps the spans.
_MATH_SPLIT = re.compile(r'(\$\$[^$]+\$\$|\$[^$]+\$)')


def process_markdown_file(input_file, output_file):
    """Reflow a Markdown file whose lines were broken arbitrarily.

    The file is read as a list of ``[text, tag]`` entries and passed through
    these steps:

    1. Drop blank lines.
    2. Merge everything between ``<!--DoNotTouchBelow-->`` and
       ``<!--DoNotTouchAbove-->`` (markers included) into one untouched entry.
    3. Tag heading lines (``#`` characters followed by a space).
    4. Join each run of remaining lines into one paragraph, removing a
       trailing hyphen used for line-wrapping.
    5. Split ``$...$`` / ``$$...$$`` spans out of paragraphs.
    6. Glue short equations (no ``=``) back into the surrounding text and
       promote the others to ``$$...$$`` entries of their own.
    7. Break paragraphs after a lowercase letter + ``. `` + uppercase letter.
    8. Write all entries to ``output_file`` joined by newlines.

    Args:
        input_file: Path of the UTF-8 Markdown file to read.
        output_file: Path of the UTF-8 file to write (overwritten).
    """
    lines = _read_nonblank_lines(input_file)
    _merge_protected_regions(lines)
    _tag_headings(lines)
    _merge_plain_lines(lines)
    lines = _split_equations(lines)
    lines = _inline_short_equations(lines)
    _split_sentences(lines)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(entry[0] for entry in lines))


def _read_nonblank_lines(path):
    """Return ``[text, None]`` entries for every non-blank line of ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        stripped = (raw.rstrip('\n\r') for raw in f)
        return [[text, None] for text in stripped if text.strip()]


def _merge_protected_regions(lines):
    """Collapse each start/end marker region into a single "Fixed" entry."""
    i = 0
    while i < len(lines):
        if lines[i][0] == _PROTECT_START:
            end = i
            while end < len(lines) and lines[end][0] != _PROTECT_END:
                end += 1
            if end >= len(lines):
                # Unterminated region: no end marker exists past this point,
                # so nothing further can be merged.
                break
            merged = '\n'.join(entry[0] for entry in lines[i:end + 1])
            # Replace in place; `i` now points at the merged entry, so the
            # increment below lands on the line right after the region.
            lines[i:end + 1] = [[merged, "Fixed"]]
        i += 1


def _tag_headings(lines):
    """Tag untagged entries that look like Markdown headings as "Title"."""
    for entry in lines:
        if entry[1] is None and re.match(r'^#+ ', entry[0]):
            entry[1] = "Title"


def _merge_plain_lines(lines):
    """Join each run of untagged entries into one "Text" entry."""
    i = 0
    while i < len(lines):
        if lines[i][1] is None:
            j = i
            pieces = []
            prev_hyphenated = False
            while j < len(lines) and lines[j][1] is None:
                part = lines[j][0]
                # A line that ended with a hyphen continues a broken word, so
                # the next line is attached without a separating space.
                if pieces and not prev_hyphenated:
                    pieces.append(" ")
                prev_hyphenated = part.endswith('-')
                pieces.append(part[:-1] if prev_hyphenated else part)
                j += 1
            lines[i:j] = [["".join(pieces), "Text"]]
        i += 1


def _split_equations(lines):
    """Split math spans out of "Text" entries into "Equation" entries."""
    result = []
    for entry in lines:
        if entry[1] != "Text":
            result.append(entry)
            continue
        for part in _MATH_SPLIT.split(entry[0]):
            if not part:
                continue
            if _MATH_SPAN.fullmatch(part):
                result.append([part, "Equation"])
            elif part.strip():
                # Whitespace-only fragments between equations are dropped.
                result.append([part, "Text"])
    return result


def _inline_short_equations(lines):
    """Fold short equations into neighbouring text; display the rest."""
    result = []
    i = 0
    while i < len(lines):
        entry = lines[i]
        if entry[1] != "Equation":
            result.append(entry)
            i += 1
            continue

        content = entry[0]
        bare = content.strip('$')
        if len(bare) < _SHORT_EQUATION_MAX_LEN and '=' not in bare:
            # Short equation: append to the preceding text (or start one).
            if not (result and result[-1][1] == "Text"):
                result.append(["", "Text"])
            result[-1][0] += content
            # Also absorb the text that follows, so the sentence stays whole.
            if i + 1 < len(lines) and lines[i + 1][1] == "Text":
                result[-1][0] += lines[i + 1][0]
                i += 1
        elif content.startswith('$') and not content.startswith('$$'):
            # Long inline equation: promote to display math.
            result.append(['$$' + bare + '$$', "Equation"])
        else:
            result.append([content, "Equation"])
        i += 1
    return result


def _split_sentences(lines):
    """Insert a newline at each sentence boundary inside "Text" entries."""
    for entry in lines:
        if entry[1] == "Text":
            entry[0] = re.sub(r'([a-z])\. ([A-Z])', r'\1.\n\2', entry[0])


# Usage example
# Both paths are resolved against the current working directory.
input_file = "source.md"
output_file = "processed.md"

# Reflow the raw Markdown and write the cleaned-up copy.
process_markdown_file(input_file=input_file, output_file=output_file)

功能:将从 PDF 复制得到的、分行混乱的 md 文件整理成规范格式:每句话一行;较短的公式(少于 30 个字符且不含等号)嵌在行内,否则单独成行;大型 md 表格等不希望被改动的内容,可用注释 `<!--DoNotTouchBelow-->` 与 `<!--DoNotTouchAbove-->` 包围,以原样保留.

代码2:

import re


def merge_bilingual_markdown(english_file, chinese_file, output_file):
    """Interleave an English Markdown file with its Chinese translation.

    The two files are paired line by line. For each pair:

    * both blank: the pair is skipped;
    * both headings: one merged heading line is written;
    * English line is pure LaTeX: only the English line is written,
      followed by a blank line;
    * otherwise: the English line, the Chinese line (if not blank) and a
      blank line are written.

    If the files differ in length a warning is printed and the shorter one
    is padded with blank lines.

    Args:
        english_file (str): Path of the English Markdown file.
        chinese_file (str): Path of the translated Chinese Markdown file.
        output_file (str): Path of the bilingual file to write.
    """
    with open(english_file, 'r', encoding='utf-8') as f:
        english_lines = f.readlines()
    with open(chinese_file, 'r', encoding='utf-8') as f:
        chinese_lines = f.readlines()

    # Line-by-line pairing only works when both files have the same length.
    if len(english_lines) != len(chinese_lines):
        print(f"警告: 文件行数不同。英文文件 {len(english_lines)} 行，中文文件 {len(chinese_lines)} 行")
        max_lines = max(len(english_lines), len(chinese_lines))
        english_lines.extend(['\n'] * (max_lines - len(english_lines)))
        chinese_lines.extend(['\n'] * (max_lines - len(chinese_lines)))

    merged_lines = []
    for raw_eng, raw_chn in zip(english_lines, chinese_lines):
        eng_line = raw_eng.rstrip('\n')
        chn_line = raw_chn.rstrip('\n')

        if is_empty_line(eng_line) and is_empty_line(chn_line):
            continue

        if is_heading(eng_line) and is_heading(chn_line):
            merged_lines.append(merge_headings(eng_line, chn_line) + '\n')
        elif is_latex_line(eng_line):
            # Formulas are not translated: keep the original only.
            merged_lines.append(eng_line + '\n')
            merged_lines.append('\n')
        else:
            # Body text: English first, then the translation.
            merged_lines.append(eng_line + '\n')
            if not is_empty_line(chn_line):
                merged_lines.append(chn_line + '\n')
            merged_lines.append('\n')

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(merged_lines)

    print(f"对照文件已保存至: {output_file}")


def is_empty_line(line):
    """Return True if ``line`` contains nothing but whitespace.

    Args:
        line (str): Line content.

    Returns:
        bool: Whether the line is blank.
    """
    return line.strip() == ''


def is_heading(line):
    """Return True if ``line`` is a Markdown heading.

    A heading is one or more ``#`` characters followed by a space, after
    surrounding whitespace has been stripped.

    Args:
        line (str): Line content.

    Returns:
        bool: Whether the line is a heading.
    """
    return re.match(r'^#+ ', line.strip()) is not None


def is_latex_line(line):
    """Return True if ``line`` looks like a pure LaTeX line.

    This is a heuristic. A line counts as LaTeX when, after stripping, it

    * starts and ends with ``$$`` (display math, or a bare ``$$`` delimiter);
    * starts with a backslash command such as ``\\begin{...}`` or ``\\frac``;
    * is exactly one inline ``$...$`` span.

    A sentence that merely begins and ends with inline math, for example
    ``$x$ is larger than $y$``, is NOT treated as LaTeX, so its translation
    is kept.

    Args:
        line (str): Line content.

    Returns:
        bool: Whether the line is pure LaTeX.
    """
    stripped_line = line.strip()

    if stripped_line.startswith('$$') and stripped_line.endswith('$$'):
        return True

    # Any leading backslash command; this also covers lines that open an
    # environment with \begin{...}.
    if re.match(r'\\[a-zA-Z]+', stripped_line):
        return True

    # Exactly one $...$ span with no other '$' inside. This rejects a lone
    # '$' and rejects text placed between two inline formulas.
    return re.fullmatch(r'\$[^$]+\$', stripped_line) is not None


def merge_headings(eng_heading, chn_heading):
    """Merge an English and a Chinese heading into a single line.

    The English heading is kept as is (including its ``#`` marks); the
    Chinese heading has its leading ``#`` marks and spaces removed and is
    appended after a space.

    Args:
        eng_heading (str): English heading line.
        chn_heading (str): Chinese heading line.

    Returns:
        str: The merged heading line.
    """
    chn_content = chn_heading.lstrip('# ').strip()
    return f"{eng_heading} {chn_content}"


# Usage example
if __name__ == "__main__":
    # Input and output paths, relative to the current working directory.
    english_file = "导出.md"
    chinese_file = "翻译.md"
    output_file = "对照.md"

    # Build the bilingual file from the two line-aligned inputs.
    merge_bilingual_markdown(
        english_file=english_file,
        chinese_file=chinese_file,
        output_file=output_file,
    )

功能:将翻译得到的中文文本与英文文本逐行对照(要求两个文件的行一一对应);中文标题与英文标题合并在同一行,以便于梳理文章结构;纯公式行单独成行,只保留原文,不附译文.

查看全文

http://www.hskmm.com/?act=detail&tid=16912