refactor: 重构项目结构并更新依赖配置
- 移除原有的 docx_thesis 模块及其相关文件 (cli.py, config.py, converter.py)
- 新增 .claudeignore 文件以忽略 Python 生成文件和缓存
- 更新 .gitignore 文件,添加更多忽略规则,包括 .mypy_cache/, .ruff_cache/, .claude/, *.md 等
- 添加 README.md 使用说明文档
- 修改 pyproject.toml 依赖配置:新增 docxtpl、pyyaml,移除原 thesis 命令入口点并更新为 transit.__main__
- 新增 transit 模块及相应初始化文件
- 重命名 main.py 为快速入口脚本
This commit is contained in:
15
transit/__init__.py
Normal file
15
transit/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""transit —— 毕业论文 Markdown → Word 格式转换工具。"""
|
||||
|
||||
from .parser import parse_markdown
|
||||
from .body import body_to_paragraphs, replace_placeholder
|
||||
from .renderer import generate_thesis
|
||||
from .config import load_config, ThesisConfig
|
||||
|
||||
# Public API of the package: the names exported by ``from transit import *``.
__all__ = [
    "parse_markdown",
    "body_to_paragraphs",
    "replace_placeholder",
    "generate_thesis",
    "load_config",
    "ThesisConfig",
]
|
||||
34
transit/__main__.py
Normal file
34
transit/__main__.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""CLI 入口:python -m transit"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from .renderer import generate_thesis
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the thesis generation pipeline.

    Positional argument ``data`` is the Markdown source; ``-t``, ``-o`` and
    ``-c`` select the docx template, the output file and the TOML config.
    """
    cli = argparse.ArgumentParser(
        description="毕业论文 Markdown → Word 格式转换工具"
    )
    cli.add_argument("data", type=str, help="Markdown 正文文件路径(.md)")

    # (short flag, long flag, default, help text) for each optional argument,
    # registered in the same order as they appear in ``--help``.
    optional_flags = (
        ("-t", "--template", "sample.docx", "docx 模板文件路径(默认: sample.docx)"),
        ("-o", "--output", "output.docx", "输出 Word 文件路径(默认: output.docx)"),
        ("-c", "--config", None, "TOML 配置文件路径(可选)"),
    )
    for short_flag, long_flag, default_value, help_text in optional_flags:
        cli.add_argument(short_flag, long_flag, default=default_value, help=help_text)

    opts = cli.parse_args()

    generate_thesis(
        template_path=opts.template,
        data_path=opts.data,
        config_path=opts.config,
        output_path=opts.output,
    )
|
||||
|
||||
|
||||
# Run the CLI only when this module is executed directly
# (e.g. ``python -m transit``), not when it is imported.
if __name__ == "__main__":
    main()
|
||||
87
transit/body.py
Normal file
87
transit/body.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Markdown 正文 → Word 段落转换。
|
||||
|
||||
将正文 Markdown 按标题层级拆分为带样式的段落序列,
|
||||
再注入到渲染后 docx 文档的占位符位置。
|
||||
"""
|
||||
|
||||
import re
|
||||
from docx import Document
|
||||
|
||||
# Markdown ATX heading: 1-6 '#' characters, horizontal whitespace, then text.
# ``[ \t]+`` (instead of ``\s+``) keeps the match on a single line, so a lone
# "#" line is never glued to the following line and turned into a heading;
# ``\S`` guarantees the heading text is not empty.
_PAT_HEADING = re.compile(r"^(#{1,6})[ \t]+(\S.*)$", re.MULTILINE)


def _normal_blocks(text: str) -> list[dict]:
    """Split *text* on blank lines into ``Normal`` paragraph records."""
    return [
        {"text": chunk.strip(), "level": 0, "style": "Normal"}
        for chunk in re.split(r"\n\s*\n", text.strip())
        if chunk.strip()
    ]


def body_to_paragraphs(md_text: str) -> list[dict]:
    """Split a Markdown body into an ordered list of paragraph records.

    Headings become one record each; the text between headings is split on
    blank lines and every non-empty chunk becomes a ``Normal`` record.

    Each returned element has the shape::

        {"text": str, "level": int, "style": str}

    where ``level`` is the heading depth (0 for body text) and ``style`` is
    ``"Heading N"`` or ``"Normal"``.
    """
    paragraphs: list[dict] = []
    cursor = 0  # end offset of the previously consumed heading

    for heading in _PAT_HEADING.finditer(md_text):
        # Plain text sitting between the previous heading and this one.
        paragraphs.extend(_normal_blocks(md_text[cursor : heading.start()]))

        level = len(heading.group(1))
        paragraphs.append(
            {
                "text": heading.group(2).strip(),
                "level": level,
                "style": f"Heading {level}",
            }
        )
        cursor = heading.end()

    # Whatever follows the last heading (or the whole text if there is none).
    paragraphs.extend(_normal_blocks(md_text[cursor:]))
    return paragraphs
|
||||
|
||||
|
||||
def replace_placeholder(doc: Document, placeholder: str, paragraphs: list[dict]):
    """Replace the first paragraph containing *placeholder* with *paragraphs*.

    Only ``doc.paragraphs`` is searched. The matching paragraph is removed and
    one new paragraph per entry of *paragraphs* is inserted at its position,
    in order. Each entry's ``style`` is looked up in ``doc.styles``: first by
    exact name, then case-insensitively, finally falling back to ``Normal``.
    If no paragraph contains the placeholder, a warning is printed and the
    document is left untouched.
    """
    target = next((p for p in doc.paragraphs if placeholder in p.text), None)
    if target is None:
        print(f"警告:未找到占位符 '{placeholder}',正文段落未注入。")
        return

    anchor = target._element
    container = anchor.getparent()
    position = list(container).index(anchor)
    container.remove(anchor)

    # Insert back-to-front at a fixed index so the final order matches the
    # order of *paragraphs*.
    for item in reversed(paragraphs):
        # add_paragraph() appends at the end of the document; the element is
        # relocated by the insert() call below.
        inserted = doc.add_paragraph(item["text"])
        wanted = item["style"]
        try:
            chosen = doc.styles[wanted]
        except KeyError:
            lowered = wanted.lower()
            chosen = next(
                (style for style in doc.styles if style.name.lower() == lowered),
                None,
            )
            if chosen is None:
                chosen = doc.styles["Normal"]
        inserted.style = chosen
        container.insert(position, inserted._element)
|
||||
63
transit/config.py
Normal file
63
transit/config.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import tomllib
|
||||
|
||||
|
||||
@dataclass
class ThesisConfig:
    """Thesis configuration: student information, metadata and parser options.

    The thesis body itself is not stored here.
    """

    # Metadata fields; every one defaults to the literal string "<None>".
    student_name: str = "<None>"
    student_id: str = "<None>"
    college: str = "<None>"
    major: str = "<None>"
    class_: str = "<None>"  # trailing underscore: ``class`` is a Python keyword
    advisor: str = "<None>"
    advisor_title: str = "<None>"
    title: str = "<None>"

    # Options; these are not part of the dictionary returned by to_dict().
    title_from_md: bool = True
    body_start_keywords: list[str] = field(default_factory=lambda: ["绪论", "引言"])
    body_end_keywords: list[str] = field(
        default_factory=lambda: ["致谢", "参考文献", "附录"]
    )

    def to_dict(self) -> dict:
        """Return the metadata fields as a flat dict (options are left out).

        The ``class_`` attribute is exposed under the key ``"class"``.
        """
        attr_names = (
            "student_name",
            "student_id",
            "college",
            "major",
            "class_",
            "advisor",
            "advisor_title",
            "title",
        )
        # rstrip("_") only affects "class_", the one name ending in "_".
        return {name.rstrip("_"): getattr(self, name) for name in attr_names}
|
||||
|
||||
|
||||
def load_config(path: str | Path) -> ThesisConfig:
    """Load a thesis configuration from a TOML file.

    Metadata is read from the ``[metadata]`` table and options from the
    ``[options]`` table; any missing table or key falls back to the same
    default value that ``ThesisConfig`` declares.
    """
    with Path(path).open("rb") as handle:
        document = tomllib.load(handle)

    metadata = document.get("metadata", {})
    options = document.get("options", {})

    # Metadata keys whose TOML name equals the dataclass attribute name.
    same_named = (
        "student_name",
        "student_id",
        "college",
        "major",
        "advisor",
        "advisor_title",
        "title",
    )
    kwargs = {key: metadata.get(key, "<None>") for key in same_named}
    # The TOML key is "class"; the attribute carries a trailing underscore.
    kwargs["class_"] = metadata.get("class", "<None>")

    kwargs["title_from_md"] = options.get("title_from_md", True)
    kwargs["body_start_keywords"] = options.get(
        "body_start_keywords", ["绪论", "引言"]
    )
    kwargs["body_end_keywords"] = options.get(
        "body_end_keywords", ["致谢", "参考文献", "附录"]
    )

    return ThesisConfig(**kwargs)
|
||||
131
transit/parser.py
Normal file
131
transit/parser.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""
|
||||
Markdown 论文解析器。
|
||||
|
||||
将结构化 Markdown(摘要、正文、致谢、参考文献、附录)解析为字典,
|
||||
供 docx 模板渲染使用。
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
# Matches a Markdown heading of any level (1-6 '#' characters, e.g. "##" or
# "###"). An optional leading numeric label such as "1" or "2.3.1" is skipped,
# so group(1) is the run of '#' and group(2) is the heading text.
_RE_HEADING = re.compile(r"^(#{1,6})\s*(?:\d+(?:\.\d+)*\s*)?(.+)$", re.MULTILINE)
|
||||
|
||||
|
||||
def _strip_front_matter(content: str) -> str:
|
||||
"""移除 YAML front matter(``---`` 包裹的头部块)。"""
|
||||
if content.startswith("---"):
|
||||
end = content.find("---", 3)
|
||||
if end != -1:
|
||||
return content[end + 3 :]
|
||||
return content
|
||||
|
||||
|
||||
def _find_section(
    content: str, titles: list[str], after: int = 0
) -> Optional[tuple[int, int, str]]:
    """Find the first heading whose text matches one of *titles*.

    A heading matches when its text (numbering stripped by ``_RE_HEADING``)
    equals or ends with a title. The search starts at offset *after*.

    Returns ``(section_start, section_end, matched_title)`` or ``None`` when
    nothing matches. The section starts at the heading line and runs up to
    the next heading of the same or a higher level (fewer or equal ``#``),
    or to the end of *content*. Deeper sub-headings therefore stay inside
    the section.
    """
    for heading in _RE_HEADING.finditer(content, after):
        heading_text = heading.group(2).strip()
        # endswith() also covers exact equality.
        matched = next((t for t in titles if heading_text.endswith(t)), None)
        if matched is None:
            continue

        level = len(heading.group(1))
        section_end = len(content)
        for following in _RE_HEADING.finditer(content, heading.end()):
            # Only a heading at the same depth or shallower closes the section.
            if len(following.group(1)) <= level:
                section_end = following.start()
                break
        return (heading.start(), section_end, matched)
    return None
|
||||
|
||||
|
||||
def _get_section_body(content: str, section_info: tuple) -> str:
    """Return the section text that follows its heading line, stripped.

    *section_info* is a ``(start, end, ...)`` tuple as produced by
    ``_find_section``. An empty string is returned when no heading matches
    at ``start``.
    """
    start, end = section_info[0], section_info[1]
    header = _RE_HEADING.match(content, start)
    return content[header.end() : end].strip() if header else ""
|
||||
|
||||
|
||||
# Keyword lines inside the abstracts. Besides the original ``**关键词:** …``
# and ``**Key words:** …`` forms, the colon may also follow the closing ``**``
# (``**关键词**:…``) and the English label may be written "Keywords" or
# "Key Words" in any letter case.
_RE_KEYWORDS_CN = re.compile(
    r"\*\*关键词[::]?\s*\*\*\s*[::]?\s*(.*?)$", re.MULTILINE
)
_RE_KEYWORDS_EN = re.compile(
    r"\*\*Key\s?words[::]?\s*\*\*\s*[::]?\s*(.*?)$",
    re.MULTILINE | re.IGNORECASE,
)


def _split_abstract(
    section_body: str, keyword_pattern: re.Pattern
) -> tuple[str, str | None]:
    """Split an abstract into ``(text, keywords)``; keywords is None if absent."""
    kw = keyword_pattern.search(section_body)
    if kw is None:
        return section_body, None
    return section_body[: kw.start()].strip(), kw.group(1).strip()


def parse_markdown(
    md_text: str,
    body_start_kw: list[str] | None = None,
    body_end_kw: list[str] | None = None,
) -> dict:
    """Parse a thesis written in Markdown into a dict of template variables.

    Parameters
    ----------
    md_text : str
        The complete Markdown text.
    body_start_kw : list[str] | None
        Section titles that mark the start of the main body;
        defaults to [``绪论``, ``引言``].
    body_end_kw : list[str] | None
        Section titles that mark the end of the main body;
        defaults to [``致谢``, ``参考文献``, ``附录``].

    Returns
    -------
    dict
        May contain ``title``, ``abstact_cn_context``,
        ``abstract_cn_keywords``, ``abstract_en_context``,
        ``abstract_en_keywords``, ``acknowledgement``, ``reference`` and
        ``appendix`` (each only when found). ``body_md`` is always present
        and is ``""`` when no body start heading is found.
    """
    if body_start_kw is None:
        body_start_kw = ["绪论", "引言"]
    if body_end_kw is None:
        body_end_kw = ["致谢", "参考文献", "附录"]

    content = _strip_front_matter(md_text.strip())
    data: dict = {}

    # Title: the first level-1 heading.
    title_m = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
    if title_m:
        data["title"] = title_m.group(1).strip()

    # Chinese and English abstracts, each optionally ending in a keyword line.
    # NOTE: the key "abstact_cn_context" (sic) is kept as is because the
    # renderer's field list uses the same spelling.
    abstracts = (
        (["摘 要", "摘要"], _RE_KEYWORDS_CN, "abstact_cn_context", "abstract_cn_keywords"),
        (["Abstract"], _RE_KEYWORDS_EN, "abstract_en_context", "abstract_en_keywords"),
    )
    for titles, kw_pattern, text_key, kw_key in abstracts:
        section = _find_section(content, titles)
        if not section:
            continue
        text, keywords = _split_abstract(
            _get_section_body(content, section), kw_pattern
        )
        data[text_key] = text
        if keywords is not None:
            data[kw_key] = keywords

    # Main body: from the first start heading up to (not including) the
    # first end heading that follows it.
    body_start = _find_section(content, body_start_kw)
    if body_start:
        body_content = content[body_start[0] :]
        body_end = _find_section(body_content, body_end_kw)
        if body_end:
            body_content = body_content[: body_end[0]]
        data["body_md"] = body_content.strip()
    else:
        data["body_md"] = ""

    # Trailing sections are stored including their heading line.
    tail_sections = (
        ("acknowledgement", "致谢"),
        ("reference", "参考文献"),
        ("appendix", "附录"),
    )
    for key, heading in tail_sections:
        section = _find_section(content, [heading])
        if section:
            data[key] = content[section[0] : section[1]].strip()

    return data
|
||||
132
transit/renderer.py
Normal file
132
transit/renderer.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""
|
||||
论文生成编排器。
|
||||
|
||||
组装 配置 + 解析 + 模板渲染 + 正文注入 的完整流水线。
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from docxtpl import DocxTemplate
|
||||
from docx import Document
|
||||
|
||||
from .config import load_config, ThesisConfig
|
||||
from .parser import parse_markdown
|
||||
from .body import body_to_paragraphs, replace_placeholder
|
||||
|
||||
|
||||
# Context keys listed in the field-fill report printed by generate_thesis();
# a key whose value is "<None>" is reported as missing.
# NOTE(review): "abstact_cn_context" (sic) matches the key produced by the
# parser; presumably the docx template uses the same spelling, so confirm
# before renaming.
_TEXT_FIELDS = [
    "title",
    "abstact_cn_context",
    "abstract_cn_keywords",
    "abstract_en_context",
    "abstract_en_keywords",
    "acknowledgement",
    "reference",
    "appendix",
    "student_name",
    "student_id",
    "college",
    "major",
    "class",
    "advisor",
    "advisor_title",
]
|
||||
|
||||
|
||||
def generate_thesis(
    template_path: str | Path,
    data_path: str | Path,
    config_path: str | Path | None = None,
    output_path: str | Path = "output.docx",
) -> dict:
    """Run the full pipeline from Markdown + config to a Word document.

    Parameters
    ----------
    template_path : str | Path
        Path of the docxtpl template (.docx).
    data_path : str | Path
        Path of the Markdown thesis file (.md).
    config_path : str | Path | None
        Path of the TOML configuration. When ``None``, ``thesis_config.toml``
        in the working directory and ``<data_path>.toml`` are tried in turn;
        if neither exists, default settings are used.
    output_path : str | Path
        Path of the Word file to write.

    Returns
    -------
    dict
        The context used to render the template. Every key of
        ``_TEXT_FIELDS`` is present; fields without a value hold ``"<None>"``.
    """
    data_path = Path(data_path)

    # 1. Load the configuration (explicit path, auto-discovered, or defaults).
    if config_path is None:
        candidates = (
            Path("thesis_config.toml"),
            data_path.with_suffix(".toml"),
        )
        config_path = next((p for p in candidates if p.exists()), None)

    if config_path and Path(config_path).exists():
        config = load_config(config_path)
        print(f"[配置] 配置文件: {config_path}")
    else:
        config = ThesisConfig()
        print("[配置] 未找到配置文件,使用默认值。")

    # 2. Parse the Markdown. "utf-8-sig" reads plain UTF-8 as well and drops
    #    a leading BOM, which would otherwise hide a front matter block or a
    #    title on the first line from the parser.
    md_text = data_path.read_text(encoding="utf-8-sig")
    context = parse_markdown(
        md_text,
        body_start_kw=config.body_start_keywords,
        body_end_kw=config.body_end_keywords,
    )

    # 3. Merge the configuration into the context (configuration wins).
    for key, value in config.to_dict().items():
        if key == "title" and config.title_from_md and context.get("title"):
            continue  # the Markdown title takes precedence
        if value != "<None>":
            context[key] = value

    # 4. Fill missing report fields explicitly. A defaultdict is not enough:
    #    the template engine copies the context into a plain dict, which
    #    discards the default factory, so missing keys would render empty.
    for key in _TEXT_FIELDS:
        context.setdefault(key, "<None>")

    # 5. Turn the body into a list of paragraph records.
    body_md = context.get("body_md", "")
    body_paragraphs = body_to_paragraphs(body_md) if body_md else []

    # 6. Marker the template emits where the body is injected afterwards.
    placeholder = "__CONTEXT_PLACEHOLDER__"
    context["body_placeholder"] = placeholder

    # 7. Render the template.
    doc = DocxTemplate(str(template_path))
    doc.render(context)

    # 8. Save to a temporary file, reopen it and inject the body. The ".tmp"
    #    suffix is appended to the full output name so the temporary file can
    #    never coincide with the output file itself, and it is removed even
    #    when a step fails.
    out_file = Path(output_path)
    temp_path = out_file.with_name(out_file.name + ".tmp")
    try:
        doc.save(str(temp_path))
        final_doc = Document(str(temp_path))
        replace_placeholder(final_doc, placeholder, body_paragraphs)
        final_doc.save(str(output_path))
    finally:
        temp_path.unlink(missing_ok=True)

    print(f"[完成] 论文生成完成: {output_path}")

    # 9. Field-fill report.
    print("\n--- 字段填充情况 ---")
    missing = []
    for key in _TEXT_FIELDS:
        value = context[key]
        if value == "<None>":
            missing.append(key)
            print(f" [缺失] {key}")
        else:
            preview = str(value)[:60].replace("\n", " ")
            print(f" [OK] {key}: {preview}...")

    if missing:
        print("\n[警告] 以下字段缺失,已填充 '<None>':")
        for name in missing:
            print(f" - {name}")

    return dict(context)
|
||||
Reference in New Issue
Block a user