"""Convert Markdown graduation thesis → formatted Word .docx. Parses markdown line-by-line and writes a python-docx document that complies with 桂林理工大学 理工类毕业设计(论文)格式要求. """ from __future__ import annotations import re from pathlib import Path from docx import Document from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_LINE_SPACING from docx.oxml import parse_xml from docx.oxml.ns import nsdecls, qn from docx.shared import Cm, Pt, RGBColor from docx.text.paragraph import Paragraph from docx.text.run import Run from .config import ThesisFormat # ── font helpers ───────────────────────────────────────────────────────── def _set_font( run: Run, cn_font: str, en_font: str | None = None, size: float | None = None, bold: bool | None = None, italic: bool | None = None, ): if en_font: run.font.name = en_font if cn_font: rpr = run._element.get_or_add_rPr() rfonts = rpr.find(qn("w:rFonts")) if rfonts is None: rfonts = parse_xml(f'') rpr.insert(0, rfonts) rfonts.set(qn("w:eastAsia"), cn_font) if size is not None: run.font.size = Pt(size) if bold is not None: run.font.bold = bold if italic is not None: run.font.italic = italic def _set_spacing(p: Paragraph, before: int = 0, after: int = 0, line_spacing: float = 1.0): pf = p.paragraph_format pf.space_before = Pt(before) pf.space_after = Pt(after) pf.line_spacing_rule = WD_LINE_SPACING.MULTIPLE pf.line_spacing = line_spacing def _set_indent(p: Paragraph, chars: int = 2): if chars > 0: p.paragraph_format.first_line_indent = Cm(chars * 0.37) def _set_page_number_fmt(section, fmt: str): sect_pr = section._sectPr el = sect_pr.find(qn("w:pgNumType")) if el is None: el = parse_xml(f'') sect_pr.append(el) el.set(qn("w:fmt"), fmt) def _setup_footer(section, roman: bool): footer = section.footer footer.is_linked_to_previous = False # clear default empty paragraph runs to avoid extra blank line for p in footer.paragraphs: for r in p.runs: r.text = "" p = footer.paragraphs[0] p.alignment = WD_ALIGN_PARAGRAPH.CENTER p.paragraph_format.space_before = Pt(0) p.paragraph_format.space_after = Pt(0) r = p.add_run() _set_font(r, "宋体", "Times New Roman", size=9) r._element.append(parse_xml( f'')) r2 = p.add_run() r2._element.append(parse_xml( f' PAGE ')) r3 = p.add_run() r3._element.append(parse_xml( f'')) _set_page_number_fmt(section, "lowerRoman" if roman else "decimal") # ── inline markdown parser ─────────────────────────────────────────────── def _parse_inline(text: str): """Tokenise line → list of (text, attrs) tuples.""" tokens: list[tuple[str, dict]] = [] buf = "" i = 0 n = len(text) def flush(): nonlocal buf if buf: tokens.append((buf, {})) buf = "" while i < n: ch = text[i] # `code` if ch == "`": flush() j = text.find("`", i + 1) if j == -1: buf += ch i += 1 continue tokens.append((text[i + 1:j], {"code": True})) i = j + 1 continue # **bold** if text[i:i + 2] == "**": flush() j = text.find("**", i + 2) if j == -1: buf += ch i += 1 continue inner = text[i + 2:j] sub = _parse_inline(inner) for t, a in sub: a["bold"] = True tokens.append((t, a)) i = j + 2 continue # *italic* (single star, not **) if ch == "*" and i + 1 < n and text[i + 1] != "*": flush() j = text.find("*", i + 1) if j == -1: buf += ch i += 1 continue tokens.append((text[i + 1:j], {"italic": True})) i = j + 1 continue buf += ch i += 1 flush() return tokens def _add_inline(p: Paragraph, tokens: list, cfg: ThesisFormat, size: float | None = None, bold: bool = False): for text, attrs in tokens: run = p.add_run(text) b = bold or attrs.get("bold", False) it = attrs.get("italic", False) code = attrs.get("code", False) cn = cfg.font_code if code else cfg.font_cn en = cfg.font_code if code else cfg.font_en _set_font(run, cn, en, size=size or cfg.size_body, bold=b, italic=it if not b else None) # ── block-level parser ─────────────────────────────────────────────────── def _parse_blocks(text: str): lines = text.split("\n") blocks: list[dict] = [] i, n = 0, len(lines) while i < n: line = lines[i] # thematic break if line.strip() == "---": blocks.append({"type": "thematic_break"}) i += 1 continue # fenced code block if line.strip().startswith("```") or line.strip().startswith("~~~"): fence = line.strip()[:3] info = line.strip()[3:].strip() code_lines: list[str] = [] i += 1 while i < n and not lines[i].strip().startswith(fence): code_lines.append(lines[i]) i += 1 i += 1 blocks.append({"type": "block_code", "info": info, "raw": "\n".join(code_lines)}) continue # heading m = re.match(r"^(#{1,6})\s+(.+)$", line) if m: blocks.append({"type": "heading", "level": len(m.group(1)), "text": m.group(2).strip()}) i += 1 continue # blockquote if line.strip().startswith(">"): ql: list[str] = [] while i < n and (lines[i].strip().startswith(">") or lines[i].strip() == ""): ql.append(re.sub(r"^>\s?", "", lines[i])) i += 1 blocks.append({"type": "block_quote", "text": "\n".join(ql).strip()}) continue # list if re.match(r"^(\s*)([-*+]\s|\d+\.\s)", line): items: list[str] = [] while i < n: if re.match(r"^(\s*)([-*+]\s|\d+\.\s)", lines[i]): t = re.sub(r"^(\s*)[-*+]\s|\d+\.\s", "", lines[i], 1) items.append(t) i += 1 while i < n and lines[i].strip() \ and not re.match(r"^(\s*)([-*+]\s|\d+\.\s)", lines[i]): if lines[i][0] in " \t": items[-1] += " " + lines[i].strip() i += 1 else: break elif lines[i].strip() == "": i += 1 else: break blocks.append({"type": "list", "items": items}) continue # blank if line.strip() == "": i += 1 continue # paragraph (accumulate) para: list[str] = [] while i < n and lines[i].strip(): para.append(lines[i]) i += 1 t = "\n".join(para).strip() if t: blocks.append({"type": "paragraph", "text": t}) return blocks # ── converter ──────────────────────────────────────────────────────────── class ThesisConverter: """Markdown → 理工类毕业论文 Word 文档。 处理流程: 1. 解析 MD → blocks 2. 扫描 blocks 提取论文题目(H1) 3. 按章节类别写入带正确格式的 Word 4. 每章自动分页、页面网格、字体字号严格按学校要求 """ def __init__(self, config: ThesisFormat | None = None): self.config = config or ThesisFormat() self.doc = Document() self._thesis_title: str = "" # 论文题目(来自 H1) self._has_title = False # 是否已保存论文题目 self._section_break_added = False # 是否插入过正文分节符 # ── public API ────────────────────────────────────────────────── def convert(self, md_path: str | Path, docx_path: str | Path): text = Path(md_path).read_text(encoding="utf-8") text = self._strip_manual_toc(text) blocks = _parse_blocks(text) # extract H1 thesis title for blk in blocks: if blk["type"] == "heading" and blk["level"] == 1: self._thesis_title = blk["text"] break self._setup_document() self._process_blocks(blocks) self.doc.save(str(docx_path)) # ── strip manual TOC ──────────────────────────────────────────── @staticmethod def _strip_manual_toc(text: str) -> str: lines = text.split("\n") toc_start = -1 sep_end = -1 for i, line in enumerate(lines): if re.search(r"[目目]\s*[次次]", line) and line.startswith("#"): toc_start = i if toc_start >= 0 and line.strip() == "---" and i > toc_start: sep_end = i break if toc_start >= 0 and sep_end > toc_start: kept = lines[:toc_start + 1] kept.append("") kept.extend(lines[sep_end:]) return "\n".join(kept) return text # ── page setup ────────────────────────────────────────────────── def _setup_document(self): cfg = self.config sec = self.doc.sections[0] self._apply_page_setup(sec, roman=True) # default font styles = self.doc.styles normal = styles["Normal"] rpr = normal.element.get_or_add_rPr() rfonts = rpr.find(qn("w:rFonts")) if rfonts is None: rfonts = parse_xml(f'') rpr.insert(0, rfonts) rfonts.set(qn("w:ascii"), cfg.font_en) rfonts.set(qn("w:hAnsi"), cfg.font_en) rfonts.set(qn("w:eastAsia"), cfg.font_cn) rfonts.set(qn("w:cs"), cfg.font_en) sz = rpr.find(qn("w:sz")) if sz is None: sz = parse_xml( f'') rpr.append(sz) pf = normal.paragraph_format pf.line_spacing_rule = WD_LINE_SPACING.MULTIPLE pf.line_spacing = cfg.line_spacing_body self._config_heading_styles() _setup_footer(sec, roman=True) def _config_heading_styles(self): """Configure Heading 1/2/3 built-in styles to match thesis formatting. This ensures Word's TOC field can detect headings and auto-generate the table of contents correctly. """ cfg = self.config styles = self.doc.styles # ── Heading 1 = 章 (三号宋体加粗左) ────────────────────────── h1 = styles["Heading 1"] h1.font.name = cfg.font_heading_en rpr = h1.element.get_or_add_rPr() rfonts = rpr.find(qn("w:rFonts")) if rfonts is None: rfonts = parse_xml(f'') rpr.insert(0, rfonts) rfonts.set(qn("w:eastAsia"), cfg.font_cn_heading) h1.font.size = Pt(cfg.size_chapter) h1.font.bold = True h1.font.color.rgb = RGBColor(0, 0, 0) h1.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.LEFT h1.paragraph_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE h1.paragraph_format.line_spacing = cfg.line_spacing_heading h1.paragraph_format.space_before = Pt(0) h1.paragraph_format.space_after = Pt(0) # Keep with next + page break before pPr = h1.element.get_or_add_pPr() keep_next = parse_xml(f'') pPr.append(keep_next) # ── Heading 2 = 节 (小三号宋体加粗左) ──────────────────────── h2 = styles["Heading 2"] h2.font.name = cfg.font_heading_en rpr2 = h2.element.get_or_add_rPr() rfonts2 = rpr2.find(qn("w:rFonts")) if rfonts2 is None: rfonts2 = parse_xml(f'') rpr2.insert(0, rfonts2) rfonts2.set(qn("w:eastAsia"), cfg.font_cn_heading) h2.font.size = Pt(cfg.size_section) h2.font.bold = True h2.font.color.rgb = RGBColor(0, 0, 0) h2.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.LEFT h2.paragraph_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE h2.paragraph_format.line_spacing = cfg.line_spacing_heading h2.paragraph_format.space_before = Pt(0) h2.paragraph_format.space_after = Pt(0) # ── Heading 3 = 条 (四号宋体加粗左) ────────────────────────── h3 = styles["Heading 3"] h3.font.name = cfg.font_heading_en rpr3 = h3.element.get_or_add_rPr() rfonts3 = rpr3.find(qn("w:rFonts")) if rfonts3 is None: rfonts3 = parse_xml(f'') rpr3.insert(0, rfonts3) rfonts3.set(qn("w:eastAsia"), cfg.font_cn_heading) h3.font.size = Pt(cfg.size_subsection) h3.font.bold = True h3.font.color.rgb = RGBColor(0, 0, 0) h3.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.LEFT h3.paragraph_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE h3.paragraph_format.line_spacing = cfg.line_spacing_heading h3.paragraph_format.space_before = Pt(0) h3.paragraph_format.space_after = Pt(0) def _add_section_break_main(self): sec = self.doc.add_section() self._apply_page_setup(sec, roman=False) self._section_break_added = True def _apply_page_setup(self, sec, roman: bool = True): """Apply margins, grid, and footer to a section.""" cfg = self.config sec.page_width = Cm(cfg.page_width) sec.page_height = Cm(cfg.page_height) sect_pr = sec._sectPr for el in list(sect_pr): if el.tag in (qn("w:pgMar"), qn("w:docGrid")): sect_pr.remove(el) pgMar = parse_xml( f'') sect_pr.append(pgMar) text_height_mm = (cfg.page_height - cfg.margin_top - cfg.margin_bottom) * 10 line_pitch = int(text_height_mm / cfg.grid_lines_per_page * 56.7) text_width_mm = (cfg.page_width - cfg.margin_left - cfg.margin_right) * 10 char_pitch = int(text_width_mm / cfg.grid_chars_per_line * 56.7) dg = parse_xml( f'') sect_pr.append(dg) _setup_footer(sec, roman=roman) # ── block processing ──────────────────────────────────────────── def _process_blocks(self, blocks): # State machine: # before_abstract → abstract_cn → abstract_en → toc → main state = "before_abstract" self._seen_first_chapter = False for blk in blocks: t = blk["type"] if t == "heading" and blk["level"] == 1: # Skip H1 (thesis title) — not rendered on Chinese abstract continue if t == "heading" and blk["level"] == 2: txt = blk["text"].strip() if txt.replace(" ", "") == "摘 要".replace(" ", ""): state = "abstract_cn" self._add_abstract_title("摘 要") continue if txt == "Abstract": self._add_abstract_title_en() state = "abstract_en" continue if "目" in txt and "次" in txt: state = "toc" self._add_toc("目 次") continue # Normal chapter if state in ("before_abstract", "abstract_cn", "abstract_en", "toc"): self._add_section_break_main() state = "main" self._add_page_break_if_not_first() self._add_chapter(txt) continue if t == "heading" and blk["level"] == 3: self._ensure_main_section(state) state = "main" txt = blk["text"].strip() if re.match(r"^\d+\.\d+\.\d+\s", txt): self._add_subsection(txt) else: self._add_section(txt) continue if t == "heading" and blk["level"] >= 4: self._ensure_main_section(state) state = "main" # headings below 3 → body-style bold self._add_body_para(blk["text"], bold=True, indent=False) continue # paragraphs / code / blockquote / list / thematic_break if t == "paragraph": txt = blk["text"] if not txt.strip(): continue if state == "abstract_cn": if txt.startswith("关键词:"): self._add_keywords(txt, cn=True) else: self._add_abstract_body(txt) continue if state == "abstract_en": if txt.startswith("Key words:"): self._add_keywords(txt, cn=False) else: self._add_abstract_body(txt) # 英文摘要正文 continue # Normal body self._ensure_main_section(state) state = "main" if txt.startswith("关键词:"): self._add_keywords(txt, cn=True) elif txt.startswith("Key words:"): self._add_keywords(txt, cn=False) else: self._add_body_para(txt) continue if t == "block_code": # code can appear in abstract or main — skip abstract code if state in ("abstract_cn", "abstract_en", "toc"): continue self._ensure_main_section(state) state = "main" self._process_code(blk) continue if t == "block_quote": txt = blk.get("text", "").strip() if not txt: continue self._ensure_main_section(state) state = "main" self._add_body_para(txt) continue if t == "list": self._ensure_main_section(state) state = "main" for item in blk.get("items", []): self._add_body_para("• " + item) continue if t == "thematic_break": # In front matter or already processed — handled by state continue def _ensure_main_section(self, state: str): if state in ("before_abstract", "abstract_cn", "abstract_en", "toc"): if not self._section_break_added: self._add_section_break_main() def _add_page_break_if_not_first(self): if self._seen_first_chapter: self.doc.add_page_break() else: self._seen_first_chapter = True # ══════════════════════════════════════════════════════════════ # rendering methods # ══════════════════════════════════════════════════════════════ # ── abstract ────────────────────────────────────────────────── def _add_abstract_title(self, text: str): """摘要题头:三号宋体加粗居中 (3.3节)""" cfg = self.config p = self.doc.add_paragraph() p.style = self.doc.styles["Heading 1"] p.alignment = WD_ALIGN_PARAGRAPH.CENTER _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_heading) run = p.add_run(text) _set_font(run, cfg.font_cn_heading, cfg.font_heading_en, size=cfg.size_abstract_title, bold=True) # blank line after title (§3.3) self.doc.add_paragraph() def _add_abstract_body(self, text: str): """摘要正文:小四宋体,首行缩进2字符""" cfg = self.config p = self.doc.add_paragraph() p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_body) _set_indent(p, cfg.first_line_indent_chars) tokens = _parse_inline(text) _add_inline(p, tokens, cfg) def _add_abstract_title_en(self): """英文摘要页:标题+论文题目+作者署名 (2.3节)""" cfg = self.config p = self.doc.add_paragraph() p.style = self.doc.styles["Heading 1"] p.alignment = WD_ALIGN_PARAGRAPH.CENTER _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_heading) run = p.add_run("Abstract") _set_font(run, cfg.font_cn_heading, cfg.font_heading_en, size=cfg.size_abstract_title, bold=True) self.doc.add_paragraph() # Thesis title in English (centered) if self._thesis_title: # crude English translation placeholder — user should replace p2 = self.doc.add_paragraph() p2.alignment = WD_ALIGN_PARAGRAPH.CENTER _set_spacing(p2, before=0, after=0, line_spacing=cfg.line_spacing_heading) r = p2.add_run(self._thesis_title) _set_font(r, cfg.font_cn_heading, cfg.font_heading_en, size=cfg.size_section, bold=True) # Author & teacher line p3 = self.doc.add_paragraph() p3.alignment = WD_ALIGN_PARAGRAPH.CENTER _set_spacing(p3, before=6, after=6, line_spacing=cfg.line_spacing_heading) r = p3.add_run("Student: \tTeacher: ") _set_font(r, cfg.font_cn, cfg.font_en, size=cfg.size_body) # ── keywords ────────────────────────────────────────────────── def _add_keywords(self, text: str, cn: bool): """关键词:小四宋体加粗顶格 (3.3节)""" cfg = self.config p = self.doc.add_paragraph() p.alignment = WD_ALIGN_PARAGRAPH.LEFT _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_body) label = cfg.keywords_label_cn if cn else cfg.keywords_label_en m = re.match(r"\*\*" + re.escape(label) + r"\*\*(.*)", text) if m: run = p.add_run(label) _set_font(run, cfg.font_cn_heading, cfg.font_heading_en, size=cfg.size_keyword_label, bold=True) rest = m.group(1).strip() tokens = _parse_inline(rest) _add_inline(p, tokens, cfg, size=cfg.size_keyword_label) else: tokens = _parse_inline(text) _add_inline(p, tokens, cfg, size=cfg.size_keyword_label) # ── TOC ──────────────────────────────────────────────────────── def _add_toc(self, title: str): cfg = self.config p = self.doc.add_paragraph() p.style = self.doc.styles["Heading 1"] p.alignment = WD_ALIGN_PARAGRAPH.CENTER _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_heading) run = p.add_run(title) _set_font(run, cfg.font_cn_heading, cfg.font_heading_en, size=cfg.size_abstract_title, bold=True) self.doc.add_paragraph() # blank line # Word TOC field p2 = self.doc.add_paragraph() _set_spacing(p2, before=0, after=0, line_spacing=cfg.line_spacing_body) r = p2.add_run() r._element.append(parse_xml( f'')) r2 = p2.add_run() r2._element.append(parse_xml( f'' ' TOC \\o "1-3" \\h \\z \\u ')) r3 = p2.add_run() r3._element.append(parse_xml( f'')) r4 = p2.add_run("(请右键此处 > 更新域)") _set_font(r4, cfg.font_cn, cfg.font_en, size=cfg.size_body) r5 = p2.add_run() r5._element.append(parse_xml( f'')) # ── chapter headings (第一层次) ────────────────────────────── def _add_chapter(self, text: str): """章标题:三号宋体加粗,顶格 (§3.2 表3)""" cfg = self.config p = self.doc.add_paragraph() p.style = self.doc.styles["Heading 1"] p.alignment = WD_ALIGN_PARAGRAPH.LEFT _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_heading) # Ensure double space between number and title (§2.5.2 表1) formatted = re.sub(r"^(\d+)\s+", r"\1 ", text) run = p.add_run(formatted) _set_font(run, cfg.font_cn_heading, cfg.font_heading_en, size=cfg.size_chapter, bold=True) # ── section heading (第二层次) ─────────────────────────────── def _add_section(self, text: str): """节标题:小三号宋体加粗,顶格 (§3.2 表3)""" cfg = self.config p = self.doc.add_paragraph() p.style = self.doc.styles["Heading 2"] p.alignment = WD_ALIGN_PARAGRAPH.LEFT _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_heading) # Single space between number and title (§2.5.2 表1) formatted = re.sub(r"^(\d+\.\d+)\s+", r"\1 ", text) run = p.add_run(formatted) _set_font(run, cfg.font_cn_heading, cfg.font_heading_en, size=cfg.size_section, bold=True) # ── subsection heading (第三层次) ────────────────────────── def _add_subsection(self, text: str): """条标题:四号宋体加粗,顶格 (§3.2 表3)""" cfg = self.config p = self.doc.add_paragraph() p.style = self.doc.styles["Heading 3"] p.alignment = WD_ALIGN_PARAGRAPH.LEFT _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_heading) run = p.add_run(text) _set_font(run, cfg.font_cn_heading, cfg.font_heading_en, size=cfg.size_subsection, bold=True) # ── body paragraph ────────────────────────────────────────── def _add_body_para(self, text: str, bold: bool = False, indent: bool = True): """正文:小四宋体,首行缩进2字符 (§3.2)""" cfg = self.config p = self.doc.add_paragraph() p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_body) if indent: _set_indent(p, cfg.first_line_indent_chars) tokens = _parse_inline(text) _add_inline(p, tokens, cfg, bold=bold) # ── code block ────────────────────────────────────────────── def _process_code(self, blk: dict): code = blk.get("raw", "") if not code.strip(): return cfg = self.config p = self.doc.add_paragraph() _set_spacing(p, before=0, after=0, line_spacing=cfg.line_spacing_code) pf = p.paragraph_format pf.left_indent = Cm(0.75) pPr = p._element.get_or_add_pPr() shd = parse_xml( f'') pPr.append(shd) for line in code.split("\n"): if line: run = p.add_run(line) _set_font(run, cfg.font_code, cfg.font_code, size=cfg.size_code) p.add_run("\n")