refactor: 重构项目结构并更新依赖配置

- 移除原有的 docx_thesis 模块及其相关文件 (cli.py, config.py, converter.py)
- 新增 .claudeignore 文件以忽略 Python 生成文件和缓存
- 更新 .gitignore 文件添加更多忽略规则包括 .mypy_cache/, .ruff_cache/, .claude/, *.md 等
- 添加 README.md 使用说明文档
- 修改 pyproject.toml 依赖配置，新增 docxtpl、pyyaml，移除原 thesis 命令入口点并更新为 transit.__main__
- 新增 transit 模块及相应初始化文件
- 重命名 main.py 为快速入口脚本
2026-05-08 21:06:01 +08:00
parent 5cbc1d9b76
commit ae70d05672
16 changed files with 697 additions and 941 deletions
--- a/transit/__init__.py
+++ b/transit/__init__.py
@@ -0,0 +1,15 @@
+"""transit —— 毕业论文 Markdown → Word 格式转换工具。"""
+
+from .parser import parse_markdown
+from .body import body_to_paragraphs, replace_placeholder
+from .renderer import generate_thesis
+from .config import load_config, ThesisConfig
+
+# Names exported by ``from transit import *``; each one is imported above
+# from the submodule that defines it.
+__all__ = [
+    "parse_markdown",
+    "body_to_paragraphs",
+    "replace_placeholder",
+    "generate_thesis",
+    "load_config",
+    "ThesisConfig",
+]
--- a/transit/__main__.py
+++ b/transit/__main__.py
@@ -0,0 +1,34 @@
+"""CLI 入口：python -m transit"""
+
+import argparse
+from pathlib import Path
+from .renderer import generate_thesis
+
+
+def main(argv: list[str] | None = None) -> None:
+    """Parse command-line arguments and run the thesis generation pipeline.
+
+    Parameters
+    ----------
+    argv : list[str] | None
+        Argument strings to parse. With the default ``None`` argparse reads
+        ``sys.argv[1:]``, so a plain ``main()`` call behaves exactly as
+        before; an explicit list allows programmatic invocation.
+    """
+    parser = argparse.ArgumentParser(
+        description="毕业论文 Markdown → Word 格式转换工具"
+    )
+    parser.add_argument("data", type=str, help="Markdown 正文文件路径（.md）")
+    parser.add_argument(
+        "-t", "--template", default="sample.docx", help="docx 模板文件路径（默认: sample.docx）"
+    )
+    parser.add_argument(
+        "-o", "--output", default="output.docx", help="输出 Word 文件路径（默认: output.docx）"
+    )
+    parser.add_argument(
+        "-c", "--config", default=None, help="TOML 配置文件路径（可选）"
+    )
+
+    args = parser.parse_args(argv)
+
+    # Hand the parsed paths straight to the pipeline; a ``None`` config path
+    # is passed through unchanged.
+    generate_thesis(
+        template_path=args.template,
+        data_path=args.data,
+        config_path=args.config,
+        output_path=args.output,
+    )
+
+
+# Invoke the CLI only when this module runs as the main program, not when
+# it is imported.
+if __name__ == "__main__":
+    main()
--- a/transit/body.py
+++ b/transit/body.py
@@ -0,0 +1,87 @@
+"""
+Markdown 正文 → Word 段落转换。
+
+将正文 Markdown 按标题层级拆分为带样式的段落序列，
+再注入到渲染后 docx 文档的占位符位置。
+"""
+
+import re
+from docx import Document
+
+# ATX heading: 1-6 ``#`` characters, horizontal whitespace, then non-blank
+# text on the SAME line.  ``[ \t]+`` (not ``\s+``) keeps the match from
+# running across a line break and swallowing the following line.
+_PAT_HEADING = re.compile(r"^(#{1,6})[ \t]+(\S.*)$", re.MULTILINE)
+# One or more blank lines separating two body paragraphs.
+_PAT_PARAGRAPH_BREAK = re.compile(r"\n\s*\n")
+# Markers that open and close a fenced code block.
+_FENCE_MARKERS = ("```", "~~~")
+
+
+def _normal_blocks(text: str) -> list[dict]:
+    """Split plain text on blank lines into ``Normal`` paragraph records."""
+    return [
+        {"text": block.strip(), "level": 0, "style": "Normal"}
+        for block in _PAT_PARAGRAPH_BREAK.split(text.strip())
+        if block.strip()
+    ]
+
+
+def _opening_fence(line: str) -> str | None:
+    """Return the fence marker if *line* opens a fenced code block."""
+    stripped = line.strip()
+    for marker in _FENCE_MARKERS:
+        if not stripped.startswith(marker):
+            continue
+        # "```code```" on one line is inline code, not a fence opener.
+        if marker == "```" and "`" in stripped.lstrip("`"):
+            return None
+        return marker
+    return None
+
+
+def _closes_fence(line: str, marker: str) -> bool:
+    """Return True if *line* consists only of the fence character of *marker*."""
+    stripped = line.strip()
+    return stripped.startswith(marker) and not stripped.strip(marker[0])
+
+
+def body_to_paragraphs(md_text: str) -> list[dict]:
+    """Split Markdown body text into a flat list of heading/paragraph records.
+
+    Each returned element has the shape::
+        {"text": str, "level": int, "style": str}
+    Headings get ``level`` 1-6 and ``style`` ``"Heading N"``; every other
+    blank-line-separated block gets ``level`` 0 and ``style`` ``"Normal"``.
+
+    Lines inside fenced code blocks are never treated as headings, so a
+    ``# comment`` in a code sample stays part of the surrounding text.
+    """
+    paragraphs: list[dict] = []
+    pending: list[str] = []  # non-heading lines collected since the last heading
+    fence: str | None = None  # marker of the currently open code fence, if any
+
+    for line in md_text.split("\n"):
+        if fence is not None:
+            if _closes_fence(line, fence):
+                fence = None
+            pending.append(line)
+            continue
+
+        fence = _opening_fence(line)
+        heading = None if fence is not None else _PAT_HEADING.match(line)
+        if heading is None:
+            pending.append(line)
+            continue
+
+        # Flush the text that preceded this heading, then emit the heading.
+        paragraphs.extend(_normal_blocks("\n".join(pending)))
+        pending.clear()
+        level = len(heading.group(1))
+        paragraphs.append(
+            {
+                "text": heading.group(2).strip(),
+                "level": level,
+                "style": f"Heading {level}",
+            }
+        )
+
+    # Trailing text after the last heading (or the whole input if none).
+    paragraphs.extend(_normal_blocks("\n".join(pending)))
+    return paragraphs
+
+
+def replace_placeholder(doc: Document, placeholder: str, paragraphs: list[dict]):
+    """Swap the first paragraph containing *placeholder* for *paragraphs*.
+
+    The first paragraph in ``doc.paragraphs`` whose text contains
+    *placeholder* is removed and one new paragraph per entry of
+    *paragraphs* is inserted at its position, in order.  Each entry's
+    ``style`` is looked up in ``doc.styles`` by exact name, then by
+    case-insensitive name, and finally falls back to ``"Normal"``.
+    If no paragraph contains the placeholder, a warning is printed and
+    the document is left untouched.
+    """
+    anchor = next((p for p in doc.paragraphs if placeholder in p.text), None)
+    if anchor is None:
+        print(f"警告：未找到占位符 '{placeholder}'，正文段落未注入。")
+        return
+
+    container = anchor._element.getparent()
+    position = list(container).index(anchor._element)
+    container.remove(anchor._element)
+
+    # Insert in reverse at a fixed index so the final order matches the
+    # input.  add_paragraph() first appends the element through the
+    # document; insert() then places it at the anchor's old position.
+    for item in reversed(paragraphs):
+        inserted = doc.add_paragraph(item["text"])
+        wanted = item["style"]
+        try:
+            inserted.style = doc.styles[wanted]
+        except KeyError:
+            folded = wanted.lower()
+            fallback = next(
+                (s for s in doc.styles if s.name.lower() == folded), None
+            )
+            inserted.style = (
+                fallback if fallback is not None else doc.styles["Normal"]
+            )
+        container.insert(position, inserted._element)
--- a/transit/config.py
+++ b/transit/config.py
@@ -0,0 +1,63 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import tomllib
+
+
+@dataclass
+class ThesisConfig:
+    """Thesis configuration: student info and metadata, but no body content."""
+
+    # Template metadata; "<None>" is the placeholder for "not provided".
+    student_name: str = "<None>"
+    student_id: str = "<None>"
+    college: str = "<None>"
+    major: str = "<None>"
+    class_: str = "<None>"  # exported under the key "class" by to_dict()
+    advisor: str = "<None>"
+    advisor_title: str = "<None>"
+    title: str = "<None>"
+
+    # Parsing options; these are not exported by to_dict().
+    title_from_md: bool = True
+    body_start_keywords: list[str] = field(default_factory=lambda: ["绪论", "引言"])
+    body_end_keywords: list[str] = field(
+        default_factory=lambda: ["致谢", "参考文献", "附录"]
+    )
+
+    def to_dict(self) -> dict:
+        """Return the metadata fields as a flat dict for template rendering.
+
+        Option fields are left out, and ``class_`` is exported as ``"class"``.
+        """
+        exported = (
+            ("student_name", "student_name"),
+            ("student_id", "student_id"),
+            ("college", "college"),
+            ("major", "major"),
+            ("class", "class_"),
+            ("advisor", "advisor"),
+            ("advisor_title", "advisor_title"),
+            ("title", "title"),
+        )
+        return {key: getattr(self, attr) for key, attr in exported}
+
+
+def load_config(path: str | Path) -> ThesisConfig:
+    """Load the thesis configuration from a TOML file.
+
+    Values are read from the ``[metadata]`` and ``[options]`` tables.  Only
+    keys actually present in the file are passed to ``ThesisConfig``; every
+    other field keeps the dataclass default, so the defaults are defined in
+    one place only.
+
+    Raises whatever ``open`` / ``tomllib.load`` raise for a missing or
+    malformed file.
+    """
+    with Path(path).open("rb") as f:  # tomllib requires a binary file object
+        raw = tomllib.load(f)
+
+    meta = raw.get("metadata", {})
+    opts = raw.get("options", {})
+
+    # TOML key -> ThesisConfig field name ("class" is a Python keyword).
+    meta_fields = {
+        "student_name": "student_name",
+        "student_id": "student_id",
+        "college": "college",
+        "major": "major",
+        "class": "class_",
+        "advisor": "advisor",
+        "advisor_title": "advisor_title",
+        "title": "title",
+    }
+    option_fields = ("title_from_md", "body_start_keywords", "body_end_keywords")
+
+    kwargs = {attr: meta[key] for key, attr in meta_fields.items() if key in meta}
+    kwargs.update({name: opts[name] for name in option_fields if name in opts})
+    return ThesisConfig(**kwargs)
--- a/transit/parser.py
+++ b/transit/parser.py
@@ -0,0 +1,131 @@
+"""
+Markdown 论文解析器。
+
+将结构化 Markdown（摘要、正文、致谢、参考文献、附录）解析为字典，
+供 docx 模板渲染使用。
+"""
+
+import re
+from typing import Optional
+
+# Matches a heading of any level (``#`` .. ``######``) with an optional
+# numeric prefix such as ``1`` or ``1.2.3``; group 1 is the hashes, group 2
+# the title text.  Whitespace is limited to spaces/tabs so a match can never
+# run across a line break, and ``(?!#)`` rejects hash-only lines and lines
+# with more than six hashes.
+_RE_HEADING = re.compile(
+    r"^(#{1,6})(?!#)[ \t]*(?:\d+(?:\.\d+)*[ \t]*)?(.+)$", re.MULTILINE
+)
+
+
+# YAML front matter: ``---`` alone on the first line, then everything up to
+# the next line that consists only of ``---``.
+_RE_FRONT_MATTER = re.compile(
+    r"\A---[ \t]*\r?\n(?:.*?\r?\n)?---[ \t]*(?=\r?\n|\Z)", re.DOTALL
+)
+
+
+def _strip_front_matter(content: str) -> str:
+    """Remove a leading YAML front matter block delimited by ``---`` lines.
+
+    Both delimiters must stand on their own line, so a ``---`` inside a
+    value (``title: a---b``) or a longer dash run does not end the block.
+    The text after the closing delimiter is returned, starting with that
+    line's line break.  *content* is returned unchanged when it does not
+    start with a delimiter line or the block is never closed.
+    """
+    block = _RE_FRONT_MATTER.match(content)
+    if block is None:
+        return content
+    return content[block.end() :]
+
+
+def _find_section(
+    content: str, titles: list[str], after: int = 0
+) -> Optional[tuple[int, int, str]]:
+    """Find the first heading whose text ends with one of *titles*.
+
+    Headings are scanned in document order starting at offset *after*.
+    Returns ``(section_start, section_end, matched_title)`` or ``None``
+    when no heading matches.
+
+    The section starts at the heading line and runs to the next heading of
+    the same or a higher level (an equal or smaller number of ``#``), or to
+    the end of *content*.  Deeper sub-headings stay inside the section.
+    """
+    for m in _RE_HEADING.finditer(content, after):
+        raw_text = m.group(2).strip()
+        # endswith() also covers exact equality and tolerates prefixes such
+        # as "第一章 绪论".
+        matched = next((t for t in titles if raw_text.endswith(t)), None)
+        if matched is None:
+            continue
+
+        level = len(m.group(1))
+        section_end = len(content)
+        for nxt in _RE_HEADING.finditer(content, m.end()):
+            if len(nxt.group(1)) <= level:
+                section_end = nxt.start()
+                break
+        return (m.start(), section_end, matched)
+    return None
+
+
+def _get_section_body(content: str, section_info: tuple) -> str:
+    """Return the stripped section text that follows its heading line.
+
+    *section_info* is a ``(start, end, ...)`` tuple; an empty string is
+    returned when no heading matches at ``start``.
+    """
+    start, end = section_info[0], section_info[1]
+    header = _RE_HEADING.match(content, start)
+    return content[header.end() : end].strip() if header else ""
+
+
+# Bold keyword label inside an abstract, e.g. ``**关键词：** a；b``.  The colon
+# may sit inside or directly after the bold markers; group 1 is the list.
+_RE_KEYWORDS_CN = re.compile(
+    r"\*\*关键词[：:]?\s*\*\*\s*[：:]?\s*(.*?)$", re.MULTILINE
+)
+# English counterpart; accepts "Key words" and "Keywords" in any letter case.
+_RE_KEYWORDS_EN = re.compile(
+    r"\*\*Key\s?words[：:]?\s*\*\*\s*[：:]?\s*(.*?)$",
+    re.MULTILINE | re.IGNORECASE,
+)
+
+
+def _split_abstract(sec_body: str, kw_pattern: re.Pattern) -> tuple[str, str | None]:
+    """Split an abstract body into ``(text, keywords)``; keywords may be None."""
+    kw_m = kw_pattern.search(sec_body)
+    if kw_m is None:
+        return sec_body, None
+    return sec_body[: kw_m.start()].strip(), kw_m.group(1).strip()
+
+
+def parse_markdown(
+    md_text: str,
+    body_start_kw: list[str] | None = None,
+    body_end_kw: list[str] | None = None,
+) -> dict:
+    """Parse thesis Markdown text into a dict of template variables.
+
+    Parameters
+    ----------
+    md_text : str
+        The complete Markdown text.
+    body_start_kw : list[str] | None
+        Heading names that mark the start of the body; defaults to
+        [``绪论``, ``引言``].
+    body_end_kw : list[str] | None
+        Heading names that mark the end of the body; defaults to
+        [``致谢``, ``参考文献``, ``附录``].
+
+    Returns
+    -------
+    dict
+        Always contains ``body_md`` (empty string when no body start heading
+        is found).  ``title``, the abstract text/keyword keys,
+        ``acknowledgement``, ``reference`` and ``appendix`` are present only
+        when the corresponding heading exists.
+    """
+    if body_start_kw is None:
+        body_start_kw = ["绪论", "引言"]
+    if body_end_kw is None:
+        body_end_kw = ["致谢", "参考文献", "附录"]
+
+    content = _strip_front_matter(md_text.strip())
+    data: dict = {}
+
+    # Title: the first level-1 heading.  ``[ \t]+`` keeps the match on one line.
+    title_m = re.search(r"^#[ \t]+(.+)$", content, re.MULTILINE)
+    if title_m:
+        data["title"] = title_m.group(1).strip()
+
+    # Chinese abstract.
+    # NOTE: the key "abstact_cn_context" is misspelled, but the renderer's
+    # field list uses the same spelling, so it is kept as is.
+    abs_cn = _find_section(content, ["摘  要", "摘要"])
+    if abs_cn:
+        text, keywords = _split_abstract(
+            _get_section_body(content, abs_cn), _RE_KEYWORDS_CN
+        )
+        data["abstact_cn_context"] = text
+        if keywords is not None:
+            data["abstract_cn_keywords"] = keywords
+
+    # English abstract.
+    abs_en = _find_section(content, ["Abstract"])
+    if abs_en:
+        text, keywords = _split_abstract(
+            _get_section_body(content, abs_en), _RE_KEYWORDS_EN
+        )
+        data["abstract_en_context"] = text
+        if keywords is not None:
+            data["abstract_en_keywords"] = keywords
+
+    # Body: from the first start-keyword heading up to (not including) the
+    # first end-keyword heading after it.
+    body_start = _find_section(content, body_start_kw)
+    if body_start:
+        body_content = content[body_start[0] :]
+        body_end = _find_section(body_content, body_end_kw)
+        if body_end:
+            body_content = body_content[: body_end[0]]
+        data["body_md"] = body_content.strip()
+    else:
+        data["body_md"] = ""
+
+    # Trailing sections are stored with their heading line included.
+    trailing_sections = (
+        ("acknowledgement", "致谢"),
+        ("reference", "参考文献"),
+        ("appendix", "附录"),
+    )
+    for key, heading in trailing_sections:
+        found = _find_section(content, [heading])
+        if found:
+            data[key] = content[found[0] : found[1]].strip()
+
+    return data
--- a/transit/renderer.py
+++ b/transit/renderer.py
@@ -0,0 +1,132 @@
+"""
+论文生成编排器。
+
+组装 配置 + 解析 + 模板渲染 + 正文注入 的完整流水线。
+"""
+
+from collections import defaultdict
+from pathlib import Path
+from docxtpl import DocxTemplate
+from docx import Document
+
+from .config import load_config, ThesisConfig
+from .parser import parse_markdown
+from .body import body_to_paragraphs, replace_placeholder
+
+
+# Context keys checked by the fill report printed at the end of
+# generate_thesis().  "abstact_cn_context" is spelled this way on purpose:
+# it matches the key written by the Markdown parser.
+_TEXT_FIELDS = [
+    "title",
+    "abstact_cn_context",
+    "abstract_cn_keywords",
+    "abstract_en_context",
+    "abstract_en_keywords",
+    "acknowledgement",
+    "reference",
+    "appendix",
+    "student_name",
+    "student_id",
+    "college",
+    "major",
+    "class",
+    "advisor",
+    "advisor_title",
+]
+
+
+def generate_thesis(
+    template_path: str | Path,
+    data_path: str | Path,
+    config_path: str | Path | None = None,
+    output_path: str | Path = "output.docx",
+) -> dict:
+    """Run the full pipeline from Markdown + config to a Word document.
+
+    Parameters
+    ----------
+    template_path : str | Path
+        Path of the docxtpl template (.docx).
+    data_path : str | Path
+        Path of the Markdown thesis file (.md).
+    config_path : str | Path | None
+        Path of the TOML config file.  When ``None``, ``thesis_config.toml``
+        in the working directory and ``<data_path stem>.toml`` are tried in
+        that order.  If no existing file is found, defaults are used.
+    output_path : str | Path
+        Path of the Word file to write.
+
+    Returns
+    -------
+    dict
+        The rendering context, with every key of ``_TEXT_FIELDS`` present
+        (missing ones hold ``"<None>"``).
+    """
+    placeholder = "__CONTEXT_PLACEHOLDER__"
+    missing_marker = "<None>"
+    data_path = Path(data_path)
+
+    # 1. Load the configuration.
+    if config_path is None:
+        candidates = [
+            Path("thesis_config.toml"),
+            data_path.with_suffix(".toml"),
+        ]
+        config_path = next((p for p in candidates if p.exists()), None)
+
+    if config_path and Path(config_path).exists():
+        config = load_config(config_path)
+        print(f"[配置] 配置文件: {config_path}")
+    else:
+        config = ThesisConfig()
+        print("[配置] 未找到配置文件，使用默认值。")
+
+    # 2. Parse the Markdown source.
+    with open(data_path, "r", encoding="utf-8") as f:
+        md_text = f.read()
+
+    context = parse_markdown(
+        md_text,
+        body_start_kw=config.body_start_keywords,
+        body_end_kw=config.body_end_keywords,
+    )
+
+    # 3. Merge config values into the context (config wins when set).
+    for k, v in config.to_dict().items():
+        if k == "title" and config.title_from_md and context.get("title"):
+            continue  # the Markdown title takes precedence
+        if v != missing_marker:
+            context[k] = v
+
+    # 4. Fill missing fields explicitly.  A defaultdict is not enough here:
+    #    Jinja copies the context into a plain dict when rendering, which
+    #    drops the default factory, so absent keys would render as empty.
+    ctx = dict(context)
+    for key in _TEXT_FIELDS:
+        ctx.setdefault(key, missing_marker)
+
+    # 5. Turn the body Markdown into a paragraph list.
+    body_md = ctx.get("body_md", "")
+    body_paragraphs = body_to_paragraphs(body_md) if body_md else []
+
+    # 6. The template emits this marker where the body is injected later.
+    ctx["body_placeholder"] = placeholder
+
+    # 7. Render the template.
+    doc = DocxTemplate(str(template_path))
+    doc.render(ctx)
+
+    # 8-9. Save to a temp file, reload it, inject the body, write the result.
+    #      The temp name appends ".tmp" to the output name so it can never
+    #      collide with the output, and it is removed even if a step fails.
+    out = Path(output_path)
+    temp_path = out.with_name(out.name + ".tmp")
+    try:
+        doc.save(str(temp_path))
+        final_doc = Document(str(temp_path))
+        replace_placeholder(final_doc, placeholder, body_paragraphs)
+        final_doc.save(str(output_path))
+    finally:
+        temp_path.unlink(missing_ok=True)
+
+    print(f"[完成] 论文生成完成: {output_path}")
+
+    # 10. Field fill report.
+    print("\n--- 字段填充情况 ---")
+    missing: list[str] = []
+    for key in _TEXT_FIELDS:
+        val = ctx[key]
+        if val == missing_marker:
+            missing.append(key)
+            print(f"  [缺失] {key}")
+        else:
+            preview = str(val)[:60].replace("\n", " ")
+            print(f"  [OK] {key}: {preview}...")
+
+    if missing:
+        print("\n[警告] 以下字段缺失，已填充 '<None>'：")
+        for name in missing:
+            print(f"  - {name}")
+
+    return dict(ctx)