"""Convert Markdown graduation thesis → formatted Word .docx.
Parses markdown line-by-line and writes a python-docx document that
complies with 桂林理工大学 理工类毕业设计(论文)格式要求.
"""
from __future__ import annotations
import re
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_LINE_SPACING
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls, qn
from docx.shared import Cm, Pt, RGBColor
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from .config import ThesisFormat
# ── font helpers ─────────────────────────────────────────────────────────
def _set_font(
    run: Run,
    cn_font: str,
    en_font: str | None = None,
    size: float | None = None,
    bold: bool | None = None,
    italic: bool | None = None,
):
    """Apply typeface, size and emphasis settings to a single run.

    Args:
        run: The run to format.
        cn_font: Typeface written to the ``w:eastAsia`` attribute of the
            run's ``w:rFonts`` element; skipped when empty.
        en_font: Typeface assigned through ``run.font.name``; skipped when
            empty or ``None``.
        size: Font size in points; ``None`` leaves the size untouched.
        bold: Bold flag; ``None`` leaves the run's setting untouched.
        italic: Italic flag; ``None`` leaves the run's setting untouched.
    """
    if en_font:
        run.font.name = en_font
    if cn_font:
        rpr = run._element.get_or_add_rPr()
        rfonts = rpr.find(qn("w:rFonts"))
        if rfonts is None:
            # parse_xml() needs a real element carrying the `w` namespace
            # declaration; an empty string cannot be parsed.
            rfonts = parse_xml(f'<w:rFonts {nsdecls("w")}/>')
            rpr.insert(0, rfonts)
        rfonts.set(qn("w:eastAsia"), cn_font)
    if size is not None:
        run.font.size = Pt(size)
    if bold is not None:
        run.font.bold = bold
    if italic is not None:
        run.font.italic = italic
def _set_spacing(p: Paragraph, before: int = 0, after: int = 0,
                 line_spacing: float = 1.0):
    """Set the space before/after a paragraph and its line-spacing multiple.

    Args:
        p: Paragraph to adjust.
        before: Space before the paragraph, in points.
        after: Space after the paragraph, in points.
        line_spacing: Line-spacing value, applied with the MULTIPLE rule.
    """
    fmt = p.paragraph_format
    fmt.space_before, fmt.space_after = Pt(before), Pt(after)
    # The rule is assigned before the value, as in the original sequence.
    fmt.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
    fmt.line_spacing = line_spacing
def _set_indent(p: Paragraph, chars: int = 2, char_width_cm: float = 0.37):
    """Set a first-line indent expressed as a number of characters.

    Args:
        p: Paragraph to indent.
        chars: Indent width in characters; values <= 0 leave the
            paragraph's current indent untouched.
        char_width_cm: Width assumed for one character, in centimetres.
            The default keeps the previous fixed value of 0.37 cm; pass a
            different width when the body font size calls for it.
    """
    if chars <= 0:
        return
    p.paragraph_format.first_line_indent = Cm(chars * char_width_cm)
def _set_page_number_fmt(section, fmt: str):
    """Set the page-number format of a section.

    Args:
        section: Section whose ``w:sectPr`` is updated.
        fmt: Value written to the ``w:fmt`` attribute of ``w:pgNumType``
            (callers in this file pass ``"lowerRoman"`` or ``"decimal"``).
    """
    sect_pr = section._sectPr
    el = sect_pr.find(qn("w:pgNumType"))
    if el is None:
        # An empty string is not parseable XML; create the real element
        # with the `w` namespace declared.
        el = parse_xml(f'<w:pgNumType {nsdecls("w")}/>')
        sect_pr.append(el)
    el.set(qn("w:fmt"), fmt)
def _setup_footer(section, roman: bool):
    """Give a section its own centred footer holding a PAGE field.

    The footer is unlinked from the previous section, its first paragraph
    is centred, and a ``begin`` / ``PAGE`` / ``end`` field is appended as
    three runs. Finally the section's page-number format is set.

    Args:
        section: Section whose footer is (re)built.
        roman: ``True`` selects ``lowerRoman`` numbering, ``False``
            selects ``decimal``.
    """
    footer = section.footer
    footer.is_linked_to_previous = False
    # Remove existing runs outright (rather than blanking their text) so a
    # repeated call for the same section does not pile up empty runs.
    for para in footer.paragraphs:
        for stale in list(para.runs):
            stale._element.getparent().remove(stale._element)
    p = footer.paragraphs[0]
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_before = Pt(0)
    p.paragraph_format.space_after = Pt(0)

    ns = nsdecls("w")
    r = p.add_run()
    _set_font(r, "宋体", "Times New Roman", size=9)
    r._element.append(parse_xml(
        f'<w:fldChar {ns} w:fldCharType="begin"/>'))
    r2 = p.add_run()
    # xml:space="preserve" keeps the blanks around the field code.
    r2._element.append(parse_xml(
        f'<w:instrText {ns} xml:space="preserve"> PAGE </w:instrText>'))
    r3 = p.add_run()
    r3._element.append(parse_xml(
        f'<w:fldChar {ns} w:fldCharType="end"/>'))
    _set_page_number_fmt(section, "lowerRoman" if roman else "decimal")
# ── inline markdown parser ───────────────────────────────────────────────
def _parse_inline(text: str):
"""Tokenise line → list of (text, attrs) tuples."""
tokens: list[tuple[str, dict]] = []
buf = ""
i = 0
n = len(text)
def flush():
nonlocal buf
if buf:
tokens.append((buf, {}))
buf = ""
while i < n:
ch = text[i]
# `code`
if ch == "`":
flush()
j = text.find("`", i + 1)
if j == -1:
buf += ch
i += 1
continue
tokens.append((text[i + 1:j], {"code": True}))
i = j + 1
continue
# **bold**
if text[i:i + 2] == "**":
flush()
j = text.find("**", i + 2)
if j == -1:
buf += ch
i += 1
continue
inner = text[i + 2:j]
sub = _parse_inline(inner)
for t, a in sub:
a["bold"] = True
tokens.append((t, a))
i = j + 2
continue
# *italic* (single star, not **)
if ch == "*" and i + 1 < n and text[i + 1] != "*":
flush()
j = text.find("*", i + 1)
if j == -1:
buf += ch
i += 1
continue
tokens.append((text[i + 1:j], {"italic": True}))
i = j + 1
continue
buf += ch
i += 1
flush()
return tokens
def _add_inline(p: Paragraph, tokens: list, cfg: ThesisFormat,
                size: float | None = None, bold: bool = False):
    """Append one formatted run per inline token to a paragraph.

    Args:
        p: Paragraph receiving the runs.
        tokens: ``(text, attrs)`` tuples as produced by ``_parse_inline``.
        cfg: Format configuration supplying typefaces and the body size.
        size: Font size in points; a falsy value falls back to
            ``cfg.size_body``.
        bold: Force every run bold regardless of its token attributes.
    """
    for chunk, flags in tokens:
        run = p.add_run(chunk)
        is_bold = bold or flags.get("bold", False)
        is_italic = flags.get("italic", False)
        if flags.get("code", False):
            # Code spans use the code typeface for both scripts.
            cn_face = en_face = cfg.font_code
        else:
            cn_face, en_face = cfg.font_cn, cfg.font_en
        # For bold runs the italic flag is passed as None (left untouched).
        _set_font(run, cn_face, en_face, size=size or cfg.size_body,
                  bold=is_bold, italic=None if is_bold else is_italic)
# ── block-level parser ───────────────────────────────────────────────────
def _parse_blocks(text: str):
lines = text.split("\n")
blocks: list[dict] = []
i, n = 0, len(lines)
while i < n:
line = lines[i]
# thematic break
if line.strip() == "---":
blocks.append({"type": "thematic_break"})
i += 1
continue
# fenced code block
if line.strip().startswith("```") or line.strip().startswith("~~~"):
fence = line.strip()[:3]
info = line.strip()[3:].strip()
code_lines: list[str] = []
i += 1
while i < n and not lines[i].strip().startswith(fence):
code_lines.append(lines[i])
i += 1
i += 1
blocks.append({"type": "block_code", "info": info,
"raw": "\n".join(code_lines)})
continue
# heading
m = re.match(r"^(#{1,6})\s+(.+)$", line)
if m:
blocks.append({"type": "heading",
"level": len(m.group(1)),
"text": m.group(2).strip()})
i += 1
continue
# blockquote
if line.strip().startswith(">"):
ql: list[str] = []
while i < n and (lines[i].strip().startswith(">")
or lines[i].strip() == ""):
ql.append(re.sub(r"^>\s?", "", lines[i]))
i += 1
blocks.append({"type": "block_quote",
"text": "\n".join(ql).strip()})
continue
# list
if re.match(r"^(\s*)([-*+]\s|\d+\.\s)", line):
items: list[str] = []
while i < n:
if re.match(r"^(\s*)([-*+]\s|\d+\.\s)", lines[i]):
t = re.sub(r"^(\s*)[-*+]\s|\d+\.\s", "", lines[i], 1)
items.append(t)
i += 1
while i < n and lines[i].strip() \
and not re.match(r"^(\s*)([-*+]\s|\d+\.\s)",
lines[i]):
if lines[i][0] in " \t":
items[-1] += " " + lines[i].strip()
i += 1
else:
break
elif lines[i].strip() == "":
i += 1
else:
break
blocks.append({"type": "list", "items": items})
continue
# blank
if line.strip() == "":
i += 1
continue
# paragraph (accumulate)
para: list[str] = []
while i < n and lines[i].strip():
para.append(lines[i])
i += 1
t = "\n".join(para).strip()
if t:
blocks.append({"type": "paragraph", "text": t})
return blocks
# ── converter ────────────────────────────────────────────────────────────
class ThesisConverter:
"""Markdown → 理工类毕业论文 Word 文档。
处理流程:
1. 解析 MD → blocks
2. 扫描 blocks 提取论文题目(H1)
3. 按章节类别写入带正确格式的 Word
4. 每章自动分页、页面网格、字体字号严格按学校要求
"""
def __init__(self, config: ThesisFormat | None = None):
    """Create a converter with an empty document.

    Args:
        config: Format settings; a default ``ThesisFormat()`` is used
            when omitted.
    """
    self.config = config or ThesisFormat()
    self.doc = Document()
    self._thesis_title: str = ""  # thesis title, taken from the H1 heading
    self._has_title = False  # whether the thesis title has been stored
    self._section_break_added = False  # whether the main-body section break exists
    # Defined here as well so _add_page_break_if_not_first() is safe to
    # call before _process_blocks() has (re)initialised it.
    self._seen_first_chapter = False
# ── public API ──────────────────────────────────────────────────
def convert(self, md_path: str | Path, docx_path: str | Path):
    """Convert a Markdown file into a formatted .docx file.

    The source is read, any manual table-of-contents body is stripped,
    the text is parsed into blocks, the first level-1 heading is stored
    as the thesis title, and the document is set up, filled and saved.

    Args:
        md_path: Path of the Markdown source file.
        docx_path: Path the resulting document is saved to.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM; with
    # plain "utf-8" the BOM stays glued to the first line, so a heading
    # on that line no longer starts with "#" and is not recognised.
    text = Path(md_path).read_text(encoding="utf-8-sig")
    text = self._strip_manual_toc(text)
    blocks = _parse_blocks(text)
    # extract the H1 thesis title (first level-1 heading wins)
    for blk in blocks:
        if blk["type"] == "heading" and blk["level"] == 1:
            self._thesis_title = blk["text"]
            break
    self._setup_document()
    self._process_blocks(blocks)
    self.doc.save(str(docx_path))
# ── strip manual TOC ────────────────────────────────────────────
@staticmethod
def _strip_manual_toc(text: str) -> str:
lines = text.split("\n")
toc_start = -1
sep_end = -1
for i, line in enumerate(lines):
if re.search(r"[目目]\s*[次次]", line) and line.startswith("#"):
toc_start = i
if toc_start >= 0 and line.strip() == "---" and i > toc_start:
sep_end = i
break
if toc_start >= 0 and sep_end > toc_start:
kept = lines[:toc_start + 1]
kept.append("")
kept.extend(lines[sep_end:])
return "\n".join(kept)
return text
# ── page setup ──────────────────────────────────────────────────
def _setup_document(self):
    """Configure the first section and the document-wide default styles.

    Applies the page setup (with roman page numbers) to section 0, sets
    the typefaces, default size and line spacing of the ``Normal`` style,
    and configures the built-in heading styles.
    """
    cfg = self.config
    sec = self.doc.sections[0]
    # _apply_page_setup() also builds the footer for this section, so no
    # separate _setup_footer() call is needed afterwards.
    self._apply_page_setup(sec, roman=True)

    # default font
    normal = self.doc.styles["Normal"]
    rpr = normal.element.get_or_add_rPr()
    rfonts = rpr.find(qn("w:rFonts"))
    if rfonts is None:
        # An empty string is not parseable; create the element itself.
        rfonts = parse_xml(f'<w:rFonts {nsdecls("w")}/>')
        rpr.insert(0, rfonts)
    for attr, face in (("w:ascii", cfg.font_en),
                       ("w:hAnsi", cfg.font_en),
                       ("w:eastAsia", cfg.font_cn),
                       ("w:cs", cfg.font_en)):
        rfonts.set(qn(attr), face)

    if rpr.find(qn("w:sz")) is None:
        # w:sz is expressed in half-points.
        # NOTE(review): the default size is taken from cfg.size_body here;
        # confirm this is the intended default.
        rpr.append(parse_xml(
            f'<w:sz {nsdecls("w")} w:val="{int(cfg.size_body * 2)}"/>'))

    pf = normal.paragraph_format
    pf.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
    pf.line_spacing = cfg.line_spacing_body

    self._config_heading_styles()
def _config_heading_styles(self):
    """Configure the built-in Heading 1/2/3 styles for thesis headings.

    Paragraphs created with these built-in styles can be picked up by
    Word's TOC field, so the table of contents can be generated
    automatically. All three levels share typeface, colour, alignment and
    spacing and differ only in size:

    * Heading 1 (chapter) uses ``cfg.size_chapter``
    * Heading 2 (section) uses ``cfg.size_section``
    * Heading 3 (subsection) uses ``cfg.size_subsection``

    Heading 1 is additionally marked "keep with next".
    """
    cfg = self.config
    styles = self.doc.styles
    level_sizes = (
        ("Heading 1", cfg.size_chapter),
        ("Heading 2", cfg.size_section),
        ("Heading 3", cfg.size_subsection),
    )
    for style_name, size in level_sizes:
        style = styles[style_name]
        style.font.name = cfg.font_heading_en
        rpr = style.element.get_or_add_rPr()
        rfonts = rpr.find(qn("w:rFonts"))
        if rfonts is None:
            # An empty string is not parseable; create the element itself.
            rfonts = parse_xml(f'<w:rFonts {nsdecls("w")}/>')
            rpr.insert(0, rfonts)
        rfonts.set(qn("w:eastAsia"), cfg.font_cn_heading)
        style.font.size = Pt(size)
        style.font.bold = True
        style.font.color.rgb = RGBColor(0, 0, 0)

        pf = style.paragraph_format
        pf.alignment = WD_ALIGN_PARAGRAPH.LEFT
        pf.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
        pf.line_spacing = cfg.line_spacing_heading
        pf.space_before = Pt(0)
        pf.space_after = Pt(0)

    # Keep the chapter heading with the following paragraph. The public
    # property avoids appending a second w:keepNext element when the
    # style already defines one.
    styles["Heading 1"].paragraph_format.keep_with_next = True
def _add_section_break_main(self):
    """Start the main-body section with decimal page numbers from 1.

    Adds a new section, applies the page setup with decimal numbering,
    restarts the page counter and records that the break was inserted.
    """
    sec = self.doc.add_section()
    self._apply_page_setup(sec, roman=False)
    # Without an explicit start value the numbering simply continues from
    # the front matter, so the first body page would not be page 1.
    pg_num = sec._sectPr.find(qn("w:pgNumType"))
    if pg_num is not None:
        pg_num.set(qn("w:start"), "1")
    self._section_break_added = True
def _apply_page_setup(self, sec, roman: bool = True):
    """Apply page size, margins, document grid and footer to a section.

    Args:
        sec: The section to configure.
        roman: Forwarded to ``_setup_footer`` to choose the page-number
            format of the section's footer.
    """
    cfg = self.config
    sec.page_width = Cm(cfg.page_width)
    sec.page_height = Cm(cfg.page_height)
    sect_pr = sec._sectPr
    # Drop any existing margin / grid elements; replacements are appended
    # below.
    for el in list(sect_pr):
        if el.tag in (qn("w:pgMar"), qn("w:docGrid")):
            sect_pr.remove(el)
    # NOTE(review): the XML literal below is empty in this copy of the
    # file; judging by the variable name it presumably held a w:pgMar
    # element built from the cfg margins. Restore it before use.
    pgMar = parse_xml(
        f'')
    sect_pr.append(pgMar)
    # Text-area height and width: page size minus margins, times 10.
    # Presumably cfg values are centimetres (they are passed to Cm()
    # above), which would make these millimetres - confirm.
    text_height_mm = (cfg.page_height - cfg.margin_top
                      - cfg.margin_bottom) * 10
    # NOTE(review): 56.7 looks like twips per millimetre (1440 / 25.4);
    # confirm the unit expected by the grid attributes.
    line_pitch = int(text_height_mm / cfg.grid_lines_per_page * 56.7)
    text_width_mm = (cfg.page_width - cfg.margin_left
                     - cfg.margin_right) * 10
    char_pitch = int(text_width_mm / cfg.grid_chars_per_line * 56.7)
    # NOTE(review): this literal is empty as well; it presumably held a
    # w:docGrid element using line_pitch / char_pitch. Restore it before
    # use.
    dg = parse_xml(
        f'')
    sect_pr.append(dg)
    _setup_footer(sec, roman=roman)
# ── block processing ────────────────────────────────────────────
def _process_blocks(self, blocks):
    """Render parsed blocks into the document.

    A small state machine tracks the current part of the thesis:
    ``before_abstract`` -> ``abstract_cn`` -> ``abstract_en`` -> ``toc``
    -> ``main``. Level-1 headings are skipped. Level-2 headings either
    open a front-matter part (Chinese abstract, English abstract, TOC) or
    start a chapter. Everything else is rendered as body content; the
    main-body section break is inserted the first time body content or a
    chapter is reached.

    Front-matter headings are recognised by their whole text with all
    whitespace (including full-width spaces) ignored, so a chapter title
    that merely contains the TOC characters is not mistaken for the TOC.
    Keyword lines are recognised with or without ``**`` around the label
    and with an ASCII or full-width colon.

    Args:
        blocks: Block dictionaries as produced by ``_parse_blocks``.
    """
    front_matter = ("before_abstract", "abstract_cn", "abstract_en", "toc")
    kw_cn = re.compile(r"\s*(?:\*\*)?\s*关键词\s*(?:\*\*)?\s*[::]")
    kw_en = re.compile(r"\s*(?:\*\*)?\s*Key\s?words\s*(?:\*\*)?\s*[::]",
                       re.IGNORECASE)

    state = "before_abstract"
    self._seen_first_chapter = False
    for blk in blocks:
        t = blk["type"]

        if t == "heading":
            level = blk["level"]
            if level == 1:
                # Skip H1 (thesis title); it is not rendered as a heading.
                continue

            if level == 2:
                txt = blk["text"].strip()
                compact = re.sub(r"\s+", "", txt)
                if compact == "摘要":
                    state = "abstract_cn"
                    self._add_abstract_title("摘 要")
                elif compact.casefold() == "abstract":
                    self._add_abstract_title_en()
                    state = "abstract_en"
                elif compact == "目次":
                    state = "toc"
                    self._add_toc("目 次")
                else:
                    # Normal chapter
                    if state in front_matter:
                        self._add_section_break_main()
                    state = "main"
                    self._add_page_break_if_not_first()
                    self._add_chapter(txt)
                continue

            if level >= 3:
                self._ensure_main_section(state)
                state = "main"
                if level == 3:
                    txt = blk["text"].strip()
                    if re.match(r"^\d+\.\d+\.\d+\s", txt):
                        self._add_subsection(txt)
                    else:
                        self._add_section(txt)
                else:
                    # headings below level 3 -> body-style bold
                    self._add_body_para(blk["text"], bold=True,
                                        indent=False)
            continue

        if t == "paragraph":
            txt = blk["text"]
            if not txt.strip():
                continue
            is_kw_cn = kw_cn.match(txt) is not None
            is_kw_en = kw_en.match(txt) is not None
            if state == "abstract_cn":
                if is_kw_cn:
                    self._add_keywords(txt, cn=True)
                else:
                    self._add_abstract_body(txt)
                continue
            if state == "abstract_en":
                if is_kw_en:
                    self._add_keywords(txt, cn=False)
                else:
                    self._add_abstract_body(txt)
                continue
            # Normal body
            self._ensure_main_section(state)
            state = "main"
            if is_kw_cn:
                self._add_keywords(txt, cn=True)
            elif is_kw_en:
                self._add_keywords(txt, cn=False)
            else:
                self._add_body_para(txt)
            continue

        if t == "block_code":
            # code inside the abstracts or the TOC is skipped
            if state in ("abstract_cn", "abstract_en", "toc"):
                continue
            self._ensure_main_section(state)
            state = "main"
            self._process_code(blk)
            continue

        if t == "block_quote":
            txt = blk.get("text", "").strip()
            if not txt:
                continue
            self._ensure_main_section(state)
            state = "main"
            self._add_body_para(txt)
            continue

        if t == "list":
            self._ensure_main_section(state)
            state = "main"
            for item in blk.get("items", []):
                self._add_body_para("• " + item)
            continue

        if t == "thematic_break":
            # Separators produce no output.
            continue
def _ensure_main_section(self, state: str):
if state in ("before_abstract", "abstract_cn", "abstract_en", "toc"):
if not self._section_break_added:
self._add_section_break_main()
def _add_page_break_if_not_first(self):
if self._seen_first_chapter:
self.doc.add_page_break()
else:
self._seen_first_chapter = True
# ══════════════════════════════════════════════════════════════
# rendering methods
# ══════════════════════════════════════════════════════════════
# ── abstract ──────────────────────────────────────────────────
def _add_abstract_title(self, text: str):
    """Add the abstract heading: bold, centred, "Heading 1" style (§3.3).

    The heading run uses the heading typefaces and
    ``cfg.size_abstract_title``; an empty paragraph follows it.

    Args:
        text: Heading text to render.
    """
    cfg = self.config
    doc = self.doc
    heading = doc.add_paragraph()
    heading.style = doc.styles["Heading 1"]
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    _set_spacing(heading, before=0, after=0,
                 line_spacing=cfg.line_spacing_heading)
    _set_font(heading.add_run(text), cfg.font_cn_heading,
              cfg.font_heading_en, size=cfg.size_abstract_title, bold=True)
    # blank line after the title (§3.3)
    doc.add_paragraph()
def _add_abstract_body(self, text: str):
    """Add one abstract body paragraph: justified, first-line indented.

    Inline Markdown in ``text`` is rendered with the default body
    formatting.

    Args:
        text: Paragraph source text.
    """
    cfg = self.config
    para = self.doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    _set_spacing(para, before=0, after=0,
                 line_spacing=cfg.line_spacing_body)
    _set_indent(para, cfg.first_line_indent_chars)
    _add_inline(para, _parse_inline(text), cfg)
def _add_abstract_title_en(self):
    """Add the English abstract header: heading, thesis title, byline (§2.3).

    Writes a centred "Abstract" heading followed by an empty paragraph,
    then (when a thesis title is known) the title centred in bold, and
    finally a centred student/teacher placeholder line.
    """
    cfg = self.config
    doc = self.doc
    centred = WD_ALIGN_PARAGRAPH.CENTER

    heading = doc.add_paragraph()
    heading.style = doc.styles["Heading 1"]
    heading.alignment = centred
    _set_spacing(heading, before=0, after=0,
                 line_spacing=cfg.line_spacing_heading)
    _set_font(heading.add_run("Abstract"), cfg.font_cn_heading,
              cfg.font_heading_en, size=cfg.size_abstract_title, bold=True)
    doc.add_paragraph()

    # Thesis title (centred). The stored title is the H1 text as written;
    # it is not translated, so the user should replace it by hand.
    if self._thesis_title:
        title_para = doc.add_paragraph()
        title_para.alignment = centred
        _set_spacing(title_para, before=0, after=0,
                     line_spacing=cfg.line_spacing_heading)
        _set_font(title_para.add_run(self._thesis_title),
                  cfg.font_cn_heading, cfg.font_heading_en,
                  size=cfg.size_section, bold=True)

    # Author & teacher line
    byline = doc.add_paragraph()
    byline.alignment = centred
    _set_spacing(byline, before=6, after=6,
                 line_spacing=cfg.line_spacing_heading)
    _set_font(byline.add_run("Student: \tTeacher: "), cfg.font_cn,
              cfg.font_en, size=cfg.size_body)
# ── keywords ──────────────────────────────────────────────────
def _add_keywords(self, text: str, cn: bool):
    """Add the keywords line: left-aligned, no indent, bold label (§3.3).

    When ``text`` starts with the configured label, either plain or
    wrapped in ``**``, the label is written as its own bold run and the
    remainder is rendered as inline Markdown. Any other text is rendered
    as inline Markdown as a whole.

    Args:
        text: Paragraph source text of the keywords line.
        cn: ``True`` selects the Chinese label from the config, ``False``
            the English one.
    """
    cfg = self.config
    p = self.doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.LEFT
    _set_spacing(p, before=0, after=0,
                 line_spacing=cfg.line_spacing_body)
    label = cfg.keywords_label_cn if cn else cfg.keywords_label_en
    escaped = re.escape(label)
    # Accept "**label**rest" as well as plain "label rest". DOTALL keeps
    # keywords that continue on the following source line(s).
    m = re.match(
        r"\s*(?:\*\*\s*" + escaped + r"\s*\*\*|" + escaped + r")(.*)",
        text, re.DOTALL)
    if m:
        run = p.add_run(label)
        _set_font(run, cfg.font_cn_heading, cfg.font_heading_en,
                  size=cfg.size_keyword_label, bold=True)
        rest = m.group(1).strip()
    else:
        rest = text
    _add_inline(p, _parse_inline(rest), cfg, size=cfg.size_keyword_label)
# ── TOC ────────────────────────────────────────────────────────
def _add_toc(self, title: str):
    """Add the table-of-contents heading and a Word TOC field.

    The heading is centred and uses the "Heading 1" style, followed by an
    empty paragraph. The field is written as a complex field (begin,
    instruction, separate, placeholder result, end); the placeholder text
    is shown until the field is updated in Word.

    Args:
        title: Heading text for the table of contents.
    """
    cfg = self.config
    p = self.doc.add_paragraph()
    p.style = self.doc.styles["Heading 1"]
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    _set_spacing(p, before=0, after=0,
                 line_spacing=cfg.line_spacing_heading)
    run = p.add_run(title)
    _set_font(run, cfg.font_cn_heading, cfg.font_heading_en,
              size=cfg.size_abstract_title, bold=True)
    self.doc.add_paragraph()  # blank line

    # Word TOC field
    ns = nsdecls("w")

    def field_char(kind: str):
        # One w:fldChar marker of the given type (begin / separate / end).
        return parse_xml(f'<w:fldChar {ns} w:fldCharType="{kind}"/>')

    p2 = self.doc.add_paragraph()
    _set_spacing(p2, before=0, after=0,
                 line_spacing=cfg.line_spacing_body)
    r = p2.add_run()
    r._element.append(field_char("begin"))
    r2 = p2.add_run()
    # xml:space="preserve" keeps the blanks around the field code.
    r2._element.append(parse_xml(
        f'<w:instrText {ns} xml:space="preserve">'
        ' TOC \\o "1-3" \\h \\z \\u </w:instrText>'))
    r3 = p2.add_run()
    r3._element.append(field_char("separate"))
    r4 = p2.add_run("(请右键此处 > 更新域)")
    _set_font(r4, cfg.font_cn, cfg.font_en, size=cfg.size_body)
    r5 = p2.add_run()
    r5._element.append(field_char("end"))
# ── chapter headings (第一层次) ──────────────────────────────
def _add_chapter(self, text: str):
    """Add a chapter heading: bold, left-aligned, "Heading 1" (§3.2 table 3).

    A leading chapter number is separated from the title by exactly two
    spaces (§2.5.2 table 1); headings without a leading number are
    written unchanged.

    Args:
        text: Chapter heading text.
    """
    cfg = self.config
    p = self.doc.add_paragraph()
    p.style = self.doc.styles["Heading 1"]
    p.alignment = WD_ALIGN_PARAGRAPH.LEFT
    _set_spacing(p, before=0, after=0,
                 line_spacing=cfg.line_spacing_heading)
    # Double space between number and title, as the rule cited above
    # requires; the replacement previously inserted a single space.
    formatted = re.sub(r"^(\d+)\s+", r"\1  ", text)
    run = p.add_run(formatted)
    _set_font(run, cfg.font_cn_heading, cfg.font_heading_en,
              size=cfg.size_chapter, bold=True)
# ── section heading (第二层次) ───────────────────────────────
def _add_section(self, text: str):
    """Add a section heading: bold, left-aligned, "Heading 2" (§3.2 table 3).

    A leading ``N.N`` number is followed by a single space (§2.5.2
    table 1).

    Args:
        text: Section heading text.
    """
    cfg = self.config
    # Normalise the gap after the section number first.
    label = re.sub(r"^(\d+\.\d+)\s+", r"\1 ", text)
    heading = self.doc.add_paragraph()
    heading.style = self.doc.styles["Heading 2"]
    heading.alignment = WD_ALIGN_PARAGRAPH.LEFT
    _set_spacing(heading, before=0, after=0,
                 line_spacing=cfg.line_spacing_heading)
    _set_font(heading.add_run(label), cfg.font_cn_heading,
              cfg.font_heading_en, size=cfg.size_section, bold=True)
# ── subsection heading (第三层次) ──────────────────────────
def _add_subsection(self, text: str):
    """Add a subsection heading: bold, left-aligned, "Heading 3" (§3.2 table 3).

    The text is written as given, without renumbering or respacing.

    Args:
        text: Subsection heading text.
    """
    cfg = self.config
    heading = self.doc.add_paragraph()
    heading.style = self.doc.styles["Heading 3"]
    heading.alignment = WD_ALIGN_PARAGRAPH.LEFT
    _set_spacing(heading, before=0, after=0,
                 line_spacing=cfg.line_spacing_heading)
    _set_font(heading.add_run(text), cfg.font_cn_heading,
              cfg.font_heading_en, size=cfg.size_subsection, bold=True)
# ── body paragraph ──────────────────────────────────────────
def _add_body_para(self, text: str, bold: bool = False,
                   indent: bool = True):
    """Add a justified body paragraph with inline Markdown rendered (§3.2).

    Args:
        text: Paragraph source text.
        bold: Render every run of the paragraph in bold.
        indent: Apply the configured first-line indent when true.
    """
    cfg = self.config
    para = self.doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    _set_spacing(para, before=0, after=0,
                 line_spacing=cfg.line_spacing_body)
    if indent:
        _set_indent(para, cfg.first_line_indent_chars)
    _add_inline(para, _parse_inline(text), cfg, bold=bold)
# ── code block ──────────────────────────────────────────────
def _process_code(self, blk: dict):
    """Render a fenced code block as one left-indented paragraph.

    The block's ``"raw"`` text is split into lines; each non-empty line
    becomes a run in the code typeface at ``cfg.size_code``. Blocks whose
    text is empty or whitespace-only produce no output.

    Args:
        blk: A ``block_code`` dictionary; only its ``"raw"`` entry is read.
    """
    code = blk.get("raw", "")
    if not code.strip():
        return
    cfg = self.config
    p = self.doc.add_paragraph()
    _set_spacing(p, before=0, after=0,
                 line_spacing=cfg.line_spacing_code)
    pf = p.paragraph_format
    pf.left_indent = Cm(0.75)
    pPr = p._element.get_or_add_pPr()
    # NOTE(review): the XML literal below is empty in this copy of the
    # file; judging by the variable name it presumably held a w:shd
    # (paragraph shading) element. Restore it before use.
    shd = parse_xml(
        f'')
    pPr.append(shd)
    for line in code.split("\n"):
        if line:
            run = p.add_run(line)
            _set_font(run, cfg.font_code, cfg.font_code,
                      size=cfg.size_code)
        # NOTE(review): placed at loop level here, so a newline run
        # follows every source line, including blank lines and the last
        # one - confirm against the original indentation.
        p.add_run("\n")