refactor: 重构项目结构并更新依赖配置

- 移除原有的 docx_thesis 模块及其相关文件 (cli.py, config.py, converter.py)
- 新增 .claudeignore 文件以忽略 Python 生成文件和缓存
- 更新 .gitignore 文件，添加更多忽略规则，包括 .mypy_cache/、.ruff_cache/、.claude/、*.md 等
- 添加 README.md 使用说明文档
- 修改 pyproject.toml 依赖配置：新增 docxtpl、pyyaml，移除原 thesis 命令入口点并更新为 transit.__main__
- 新增 transit 模块及相应初始化文件
- 重命名 main.py 为快速入口脚本
2026-05-08 21:06:01 +08:00
parent 5cbc1d9b76
commit ae70d05672
16 changed files with 697 additions and 941 deletions
--- a/transit/parser.py
+++ b/transit/parser.py
@@ -0,0 +1,131 @@
+"""
+Markdown 论文解析器。
+
+将结构化 Markdown（摘要、正文、致谢、参考文献、附录）解析为字典，
+供 docx 模板渲染使用。
+"""
+
+import re
+from typing import Optional
+
+# Match a Markdown ``#`` heading of any level (one to six ``#`` characters),
+# optionally followed by a numeric prefix such as ``1`` or ``2.3.1``.
+# Group 1 is the run of ``#`` characters (its length is the heading level);
+# group 2 is the heading text that follows the optional numeric prefix.
+# NOTE: no whitespace is required after the ``#`` run, and ``\s*`` also
+# matches line breaks, so the heading text may be taken from a later line.
+_RE_HEADING = re.compile(r"^(#{1,6})\s*(?:\d+(?:\.\d+)*\s*)?(.+)$", re.MULTILINE)
+
+
+def _strip_front_matter(content: str) -> str:
+    """Remove a leading YAML front matter block from ``content``.
+
+    A front matter block is recognised only when the very first line is
+    exactly ``---`` (trailing spaces/tabs allowed) and a later line that
+    consists solely of ``---`` closes it. Anchoring both delimiters to whole
+    lines keeps a ``---`` that appears inside a YAML value (for example
+    ``title: a---b``) from being mistaken for the closing delimiter, and
+    keeps a longer run such as ``----`` from being treated as an opener.
+
+    Parameters
+    ----------
+    content : str
+        Markdown text, expected to have no leading whitespace.
+
+    Returns
+    -------
+    str
+        The text following the closing delimiter (it starts with the line
+        break that ends the delimiter line), or ``content`` unchanged when
+        no complete front matter block is present.
+    """
+    opening = re.match(r"---[ \t]*\r?\n", content)
+    if opening is None:
+        return content
+    # Search from the end of the opening line; that position directly follows
+    # a newline, so ``^`` can match there (covers an empty front matter block).
+    closing = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(
+        content, opening.end()
+    )
+    if closing is None:
+        # Unterminated block: leave the text alone rather than guess.
+        return content
+    return content[closing.end() :]
+
+
+def _find_section(
+    content: str, titles: list[str], after: int = 0
+) -> Optional[tuple[int, int, str]]:
+    """Locate the first heading whose text matches one of ``titles``.
+
+    A heading matches when its text (numeric prefix removed, surrounding
+    whitespace stripped) equals a title or ends with it, so ``第一章 绪论``
+    matches the title ``绪论``.
+
+    The section runs from the start of the heading line up to the next
+    heading of the same or a higher level (that is, with the same number of
+    ``#`` characters or fewer), or to the end of ``content`` when there is
+    none. Deeper sub-headings therefore stay inside the section.
+
+    Parameters
+    ----------
+    content : str
+        Markdown text to search.
+    titles : list[str]
+        Candidate section titles; for a given heading they are tried in
+        list order.
+    after : int
+        Offset in ``content`` at which the heading search starts.
+
+    Returns
+    -------
+    tuple[int, int, str] | None
+        ``(section_start, section_end, matched_title)`` as offsets into
+        ``content``, or ``None`` when no heading matches.
+    """
+    for heading in _RE_HEADING.finditer(content, after):
+        text = heading.group(2).strip()
+        # ``endswith`` also covers exact equality.
+        matched = next((t for t in titles if text.endswith(t)), None)
+        if matched is None:
+            continue
+
+        level = len(heading.group(1))
+        section_end = len(content)
+        # Only a heading at the same or a shallower level closes the section.
+        for later in _RE_HEADING.finditer(content, heading.end()):
+            if len(later.group(1)) <= level:
+                section_end = later.start()
+                break
+        return (heading.start(), section_end, matched)
+    return None
+
+
+def _get_section_body(content: str, section_info: tuple) -> str:
+    """Return the text of a section without its heading line.
+
+    ``section_info`` is a ``(section_start, section_end, ...)`` tuple of
+    offsets into ``content``. The heading is re-matched at ``section_start``;
+    the stripped text between the end of that heading and ``section_end`` is
+    returned, or an empty string when no heading matches at that offset.
+    """
+    heading = _RE_HEADING.match(content, section_info[0])
+    return content[heading.end() : section_info[1]].strip() if heading else ""
+
+
+def _split_keywords(
+    sec_body: str, label_pattern: str, flags: int = 0
+) -> tuple[str, str | None]:
+    """Split an abstract body into ``(text, keywords)`` at its keyword line.
+
+    The keyword line is a bold label (``**label**``) followed by the keywords.
+    A colon (ASCII or full-width) is accepted either inside or right after the
+    bold markers. ``keywords`` is ``None`` when no keyword line is found, in
+    which case ``text`` is ``sec_body`` unchanged.
+    """
+    kw_m = re.search(
+        r"\*\*" + label_pattern + r"[：:]?\s*\*\*\s*[：:]?\s*(.*?)$",
+        sec_body,
+        re.MULTILINE | flags,
+    )
+    if kw_m is None:
+        return sec_body, None
+    return sec_body[: kw_m.start()].strip(), kw_m.group(1).strip()
+
+
+def parse_markdown(
+    md_text: str,
+    body_start_kw: list[str] | None = None,
+    body_end_kw: list[str] | None = None,
+) -> dict:
+    """Parse a thesis written in Markdown into a dict of template variables.
+
+    Parameters
+    ----------
+    md_text : str
+        The complete Markdown text.
+    body_start_kw : list[str] | None
+        Section titles that mark the start of the main body; defaults to
+        [``绪论``, ``引言``].
+    body_end_kw : list[str] | None
+        Section titles that mark the end of the main body; defaults to
+        [``致谢``, ``参考文献``, ``附录``].
+
+    Returns
+    -------
+    dict
+        ``body_md`` is always present (empty string when no body-start
+        heading is found). The remaining keys are set only when the
+        corresponding part exists: ``title``, ``abstract_cn_context``,
+        ``abstract_cn_keywords``, ``abstract_en_context``,
+        ``abstract_en_keywords``, ``acknowledgement``, ``reference`` and
+        ``appendix``. The misspelled legacy key ``abstact_cn_context`` is
+        kept as an alias of ``abstract_cn_context``.
+    """
+    if body_start_kw is None:
+        body_start_kw = ["绪论", "引言"]
+    if body_end_kw is None:
+        body_end_kw = ["致谢", "参考文献", "附录"]
+
+    content = _strip_front_matter(md_text.strip())
+    data: dict = {}
+
+    # Title: the first level-1 ``#`` heading.
+    title_m = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
+    if title_m:
+        data["title"] = title_m.group(1).strip()
+
+    # Chinese abstract.
+    abs_cn = _find_section(content, ["摘  要", "摘要"])
+    if abs_cn:
+        text, keywords = _split_keywords(
+            _get_section_body(content, abs_cn), r"关键词"
+        )
+        data["abstract_cn_context"] = text
+        # Legacy misspelling, kept so templates written against it still render.
+        data["abstact_cn_context"] = text
+        if keywords is not None:
+            data["abstract_cn_keywords"] = keywords
+
+    # English abstract; accepts "Key words" and "Keywords" in any letter case.
+    abs_en = _find_section(content, ["Abstract"])
+    if abs_en:
+        text, keywords = _split_keywords(
+            _get_section_body(content, abs_en), r"Key[ \t]?words", re.IGNORECASE
+        )
+        data["abstract_en_context"] = text
+        if keywords is not None:
+            data["abstract_en_keywords"] = keywords
+
+    # Main body: from the first body-start heading up to (not including) the
+    # first body-end heading that follows it, or to the end of the text.
+    body_start = _find_section(content, body_start_kw)
+    if body_start:
+        body_content = content[body_start[0] :]
+        body_end = _find_section(body_content, body_end_kw)
+        if body_end:
+            body_content = body_content[: body_end[0]]
+        data["body_md"] = body_content.strip()
+    else:
+        data["body_md"] = ""
+
+    # Acknowledgement, references and appendix: each keeps its heading line.
+    for key, heading in (
+        ("acknowledgement", "致谢"),
+        ("reference", "参考文献"),
+        ("appendix", "附录"),
+    ):
+        section = _find_section(content, [heading])
+        if section:
+            data[key] = content[section[0] : section[1]].strip()
+
+    return data