# md2word/transit/parser.py

"""
Markdown 论文解析器。

将结构化 Markdown（摘要、正文、致谢、参考文献、附录）解析为字典，
供 docx 模板渲染使用。
"""

import re
from typing import Optional

# Matches an ATX heading of any level (``#`` .. ``######``) on a single line.
# group(1) is the run of ``#`` marks (its length is the heading level);
# group(2) is the heading text with an optional leading section number such
# as ``1`` or ``2.3.1`` removed.
#
# Separators use ``[^\S\r\n]`` (any whitespace except line breaks) instead of
# ``\s`` so an empty heading line can never swallow the following line as its
# text.  ``(?!#)`` stops the engine from backtracking to a shorter ``#`` run
# and treating the remaining ``#`` characters as heading text.
_RE_HEADING = re.compile(
    r"^(#{1,6})(?!#)[^\S\r\n]*(?:\d+(?:\.\d+)*[^\S\r\n]*)?(.+)$",
    re.MULTILINE,
)


def _strip_front_matter(content: str) -> str:
    """Remove a leading YAML front matter block delimited by ``---`` lines.

    The block is only recognised when the very first line is ``---`` (trailing
    spaces/tabs allowed) and a later line consisting solely of ``---`` closes
    it.  A ``---`` that merely appears inside a value, or a longer dash run
    such as ``----``, is not treated as a fence.

    Returns the text that follows the closing fence (starting with the line
    break after it), or ``content`` unchanged when no complete front matter
    block is present.
    """
    # ``.`` does not match ``\n``, so ``(?:.*\n)*?`` consumes whole lines
    # lazily until the first line that is exactly the closing fence.
    fence = re.match(
        r"---[ \t]*\r?\n(?:.*\n)*?---[ \t]*(?=\r?\n|\Z)", content
    )
    if fence is None:
        return content
    return content[fence.end() :]


def _find_section(
    content: str, titles: list[str], after: int = 0
) -> Optional[tuple[int, int, str]]:
    """Locate the first heading whose text matches one of ``titles``.

    A heading matches when its text (as captured by ``_RE_HEADING``, then
    stripped) equals a title or ends with it, so a heading such as
    ``第一章 绪论`` matches the title ``绪论``.

    Parameters
    ----------
    content : str
        Text to search.
    titles : list[str]
        Candidate titles; for each heading they are tried in the given order.
    after : int
        Offset in ``content`` at which the heading search starts.

    Returns
    -------
    tuple[int, int, str] | None
        ``(section_start, section_end, matched_title)`` where
        ``section_start`` is the offset of the heading line and
        ``section_end`` is the offset of the next heading of the same or a
        higher level (or ``len(content)`` when there is none).  ``None`` when
        no heading matches.
    """
    for heading in _RE_HEADING.finditer(content, after):
        heading_text = heading.group(2).strip()
        # ``endswith`` also covers exact equality.
        matched_title = next(
            (t for t in titles if heading_text.endswith(t)), None
        )
        if matched_title is None:
            continue

        # Deeper sub-headings belong to this section; only a heading with the
        # same number of ``#`` marks or fewer terminates it.
        level = len(heading.group(1))
        section_end = len(content)
        for following in _RE_HEADING.finditer(content, heading.end()):
            if len(following.group(1)) <= level:
                section_end = following.start()
                break
        return (heading.start(), section_end, matched_title)
    return None


def _get_section_body(content: str, section_info: tuple) -> str:
    """Return the text of a section without its heading line.

    ``section_info`` is a ``(section_start, section_end, ...)`` tuple as
    returned by ``_find_section``.  The heading is re-matched at
    ``section_start``; everything between the end of that heading and
    ``section_end`` is returned with surrounding whitespace stripped.  An
    empty string is returned when no heading matches at ``section_start``.
    """
    section_start = section_info[0]
    heading = _RE_HEADING.match(content, section_start)
    if heading is None:
        return ""
    body_start = heading.end()
    body_end = section_info[1]
    body = content[body_start:body_end]
    return body.strip()


# Document title: the first level-1 heading.  The separator excludes line
# breaks so an empty ``#`` line cannot pick up the next line as the title.
_RE_TITLE = re.compile(r"^#[^\S\r\n]+(.+)$", re.MULTILINE)

# Keyword lines inside an abstract.  The colon may sit inside the bold
# markers (``**关键词：** a；b``) or after them (``**关键词**：a；b``); in both
# cases it is excluded from the captured keyword text.
_RE_KEYWORDS_CN = re.compile(
    r"\*\*关键词[：:]?\s*\*\*\s*[：:]?\s*(.*?)$", re.MULTILINE
)
# The English label accepts ``Key words`` and ``Keywords`` in any letter case.
_RE_KEYWORDS_EN = re.compile(
    r"\*\*Key[ \t]?words[：:]?\s*\*\*\s*[：:]?\s*(.*?)$",
    re.MULTILINE | re.IGNORECASE,
)


def _split_keywords(
    section_body: str, keywords_re: "re.Pattern[str]"
) -> "tuple[str, str | None]":
    """Split an abstract body into ``(abstract_text, keywords)``.

    ``keywords`` is ``None`` when ``keywords_re`` does not match; in that case
    ``abstract_text`` is ``section_body`` unchanged.
    """
    kw_match = keywords_re.search(section_body)
    if kw_match is None:
        return section_body, None
    return section_body[: kw_match.start()].strip(), kw_match.group(1).strip()


def parse_markdown(
    md_text: str,
    body_start_kw: list[str] | None = None,
    body_end_kw: list[str] | None = None,
) -> dict:
    """Parse a thesis written in Markdown into a dict of template variables.

    Parameters
    ----------
    md_text : str
        The complete Markdown text.  Surrounding whitespace and a leading
        YAML front matter block are removed before parsing.
    body_start_kw : list[str] | None
        Section titles that mark the start of the main body.
        Defaults to ``["绪论", "引言"]``.
    body_end_kw : list[str] | None
        Section titles that mark the end of the main body.
        Defaults to ``["致谢", "参考文献", "附录"]``.

    Returns
    -------
    dict
        Keys are only present when the corresponding part is found, except
        ``body_md`` which is always set:

        * ``title`` - text of the first level-1 heading.
        * ``abstact_cn_context`` / ``abstract_cn_keywords`` - Chinese
          abstract text and its keyword line.
        * ``abstract_en_context`` / ``abstract_en_keywords`` - English
          abstract text and its keyword line.
        * ``body_md`` - Markdown from the body-start heading up to the first
          body-end heading (``""`` when no body-start heading exists).
        * ``acknowledgement``, ``reference``, ``appendix`` - the respective
          sections including their heading line.
    """
    if body_start_kw is None:
        body_start_kw = ["绪论", "引言"]
    if body_end_kw is None:
        body_end_kw = ["致谢", "参考文献", "附录"]

    content = _strip_front_matter(md_text.strip())
    data: dict = {}

    # Title: first level-1 heading.
    title_m = _RE_TITLE.search(content)
    if title_m:
        data["title"] = title_m.group(1).strip()

    # Chinese abstract.
    abs_cn = _find_section(content, ["摘  要", "摘要"])
    if abs_cn:
        text, keywords = _split_keywords(
            _get_section_body(content, abs_cn), _RE_KEYWORDS_CN
        )
        # NOTE(review): "abstact_cn_context" is misspelled relative to
        # "abstract_en_context".  The key is kept as-is because consumers of
        # the returned dict may rely on it - confirm before renaming.
        data["abstact_cn_context"] = text
        if keywords is not None:
            data["abstract_cn_keywords"] = keywords

    # English abstract.
    abs_en = _find_section(content, ["Abstract"])
    if abs_en:
        text, keywords = _split_keywords(
            _get_section_body(content, abs_en), _RE_KEYWORDS_EN
        )
        data["abstract_en_context"] = text
        if keywords is not None:
            data["abstract_en_keywords"] = keywords

    # Main body: from the first body-start heading up to (not including) the
    # first body-end heading that follows it.
    body_start = _find_section(content, body_start_kw)
    if body_start:
        body_content = content[body_start[0] :]
        body_end = _find_section(body_content, body_end_kw)
        if body_end:
            body_content = body_content[: body_end[0]]
        data["body_md"] = body_content.strip()
    else:
        data["body_md"] = ""

    # Trailing sections are stored with their heading line included.
    for key, heading in (
        ("acknowledgement", "致谢"),
        ("reference", "参考文献"),
        ("appendix", "附录"),
    ):
        section = _find_section(content, [heading])
        if section:
            data[key] = content[section[0] : section[1]].strip()

    return data