2025/08/08

Python | 調整 Dropbox 中 Obsidian 與 Logseq 共用 journal 資料夾內的檔名規則出入

 #!/usr/bin/env python3

# -*- coding: utf-8 -*-


import re

import sys

import argparse

from pathlib import Path

from datetime import date


# ===== 工具:月份與序數 =====

# English month names in calendar order; list index + 1 is the month number.
MONTHS_FULL = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]

# Three-letter abbreviations ("Jan" .. "Dec"), same order as MONTHS_FULL.
MONTHS_ABBR = [full_name[:3] for full_name in MONTHS_FULL]

# Name -> month number (1..12) lookups, and the reverse for full names.
MONTH_TO_NUM = {name: number for number, name in enumerate(MONTHS_FULL, start=1)}
ABBR_TO_NUM = {name: number for number, name in enumerate(MONTHS_ABBR, start=1)}
NUM_TO_MONTH = {number: name for name, number in MONTH_TO_NUM.items()}


def ordinal(n: int) -> str:
    """Return *n* with its English ordinal suffix appended (``1st``, ``22nd``, ``13th``)."""
    # Numbers ending in 11, 12 or 13 are irregular and always take "th".
    if 11 <= (n % 100) <= 13:
        return f"{n}th"
    endings = {1: "st", 2: "nd", 3: "rd"}
    return f"{n}{endings.get(n % 10, 'th')}"


def canonical_stem(d: date) -> str:
    """Return the target file stem for *d* in ``MMMM do, yyyy`` form, e.g. ``August 8th, 2025``."""
    month_name = NUM_TO_MONTH[d.month]
    # The ordinal suffix stays lowercase ("8th", not "8TH").
    day_with_suffix = ordinal(d.day)
    return f"{month_name} {day_with_suffix}, {d.year}"


# ===== 解析各種日期字串(給 [[...]] 正規化用) =====

def parse_date_token(s: str):
    """Parse the text inside a ``[[...]]`` link as a date.

    Three spellings are tried in order, ignoring surrounding whitespace:

    1. full month name:  ``August 8, 2025`` / ``August 8th, 2025``
    2. abbreviated month: ``Aug 8, 2025`` / ``Aug. 8th, 2025``
    3. numeric:           ``2025-08-08`` with ``.``, ``_``, ``/`` or ``-`` separators

    Month names are matched case-insensitively.

    Returns:
        A ``date`` on success; ``None`` when nothing matches or the matched
        numbers do not form a valid calendar date.
    """

    def _make(year: int, month: int, day: int):
        # Impossible dates (e.g. February 30th) yield None instead of raising.
        try:
            return date(year, month, day)
        except ValueError:
            return None

    token = s.strip()

    # 1) Full month name.
    found = re.match(
        rf"^\s*({'|'.join(MONTHS_FULL)})\s+(\d{{1,2}})(?:st|nd|rd|th)?,\s*(\d{{4}})\s*$",
        token,
        flags=re.IGNORECASE,
    )
    if found:
        name, day_text, year_text = found.groups()
        return _make(int(year_text), MONTH_TO_NUM[name.capitalize()], int(day_text))

    # 2) Abbreviated month name, optionally followed by a period.
    found = re.match(
        rf"^\s*({'|'.join(MONTHS_ABBR)})\.?\s+(\d{{1,2}})(?:st|nd|rd|th)?,\s*(\d{{4}})\s*$",
        token,
        flags=re.IGNORECASE,
    )
    if found:
        name, day_text, year_text = found.groups()
        return _make(int(year_text), ABBR_TO_NUM[name.capitalize()], int(day_text))

    # 3) Numeric year / month / day.
    found = re.match(r"^\s*(\d{4})[._/\-](\d{1,2})[._/\-](\d{1,2})\s*$", token)
    if found:
        year, month, day = (int(part) for part in found.groups())
        return _make(year, month, day)

    return None


# ===== 連結正規化 [[日期]] → [[MMMM do, yyyy]] =====

# Matches a plain [[...]] link whose inner text contains no ']', '|' or '#'
# (so aliased links and heading links are left alone).
REF_PATTERN = re.compile(r"\[\[\s*([^\]\|#]+?)\s*\]\]")


def rewrite_date_page_refs(text: str):
    """Rewrite every ``[[date]]`` link in *text* to the canonical ``[[MMMM do, yyyy]]`` form.

    Links whose inner text does not parse as a date are left exactly as written.

    Returns:
        ``(new_text, replaced)`` where ``replaced`` counts the links whose inner
        text parsed as a date (including links already in canonical form).
    """
    rewritten_dates = []  # one entry per link that parsed as a date

    def _swap(match):
        parsed = parse_date_token(match.group(1))
        if not parsed:
            return match.group(0)
        rewritten_dates.append(parsed)
        return f"[[{canonical_stem(parsed)}]]"

    updated = REF_PATTERN.sub(_swap, text)
    return updated, len(rewritten_dates)


# ===== 安全重新命名(加 merged 前綴時避免撞名) =====

def prefixed_unique_path(p: Path, prefix="merged ") -> Path:
    """Return a sibling path of *p* named ``prefix + p.name`` that does not exist yet.

    If the prefixed name is taken, `` (1)``, `` (2)``, ... is inserted before the
    suffix until an unused name is found. Nothing is created on disk.
    """
    candidate = p.with_name(prefix + p.name)
    base_stem, extension = candidate.stem, candidate.suffix
    attempt = 0
    while candidate.exists():
        attempt += 1
        candidate = candidate.with_name(f"{base_stem} ({attempt}){extension}")
    return candidate


# ===== 解析檔名取得日期 =====

def parse_date_from_stem(stem: str):
    """Parse a journal file stem (file name without extension) as a date.

    Recognised spellings and the style label returned for each:

    * ``"canonical"``  - the stem is exactly ``canonical_stem(date)``,
      e.g. ``August 8th, 2025``
    * ``"long_month"`` - any other full-month spelling, e.g. ``August 8, 2025``
      or ``August 08th, 2025``
    * ``"abbr_month"`` - abbreviated month (case-insensitive), e.g. ``Aug 8, 2025``
    * ``"iso"``        - numeric ``yyyy-mm-dd`` with ``.``, ``_`` or ``-`` separators

    Returns:
        ``(date, style)`` on success; ``None`` when the stem matches no pattern
        or the numbers do not form a valid calendar date.
    """
    s = stem.strip()

    # Full month name (case-sensitive, unlike the abbreviated form below).
    m = re.match(rf"^({'|'.join(MONTHS_FULL)})\s+(\d{{1,2}})(?:st|nd|rd|th)?,\s*(\d{{4}})$", s)
    if m:
        month = MONTH_TO_NUM[m.group(1)]
        day = int(m.group(2))
        year = int(m.group(3))
        try:
            d = date(year, month, day)
        except ValueError:
            return None
        # Only an exact match with the generated stem counts as canonical.
        # Merely containing an ordinal suffix is not enough: names such as
        # "August 08th, 2025", "August 8st, 2025" or "August 8th,2025" must
        # still be treated as sources so they get renamed/merged.
        style = "canonical" if stem == canonical_stem(d) else "long_month"
        return d, style

    # Abbreviated month name, optionally followed by a period.
    m = re.match(rf"^({'|'.join(MONTHS_ABBR)})\.?\s+(\d{{1,2}})(?:st|nd|rd|th)?,\s*(\d{{4}})$", s, flags=re.IGNORECASE)
    if m:
        month = ABBR_TO_NUM[m.group(1).capitalize()]
        day = int(m.group(2))
        year = int(m.group(3))
        try:
            return date(year, month, day), "abbr_month"
        except ValueError:
            return None

    # Numeric year / month / day.
    m = re.match(r"^(\d{4})[._-](\d{1,2})[._-](\d{1,2})$", s)
    if m:
        y, mo, d = map(int, m.groups())
        try:
            return date(y, mo, d), "iso"
        except ValueError:
            return None

    return None


# ===== 檔案讀寫 =====

def read_text(path: Path) -> str:
    """Return the contents of *path* decoded as UTF-8; undecodable bytes are dropped."""
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()


def write_text(path: Path, text: str):
    """Overwrite *path* with *text*, encoded as UTF-8."""
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)


def append_transformed(src: Path, dest: Path) -> tuple[int,int]:
    """Append the contents of *src*, with date links normalised, to the end of *dest*.

    A newline is written first when *dest* already has content, and the
    appended text always ends with a newline.

    Returns:
        ``(chars, replaced)``: the length of the normalised text (excluding any
        added newlines) and the number of date links rewritten in it.
    """
    fixed, replaced = rewrite_date_page_refs(read_text(src))
    with dest.open("a", encoding="utf-8") as out:
        # Checked after opening: append mode has created dest if it was missing.
        has_content = dest.exists() and dest.stat().st_size > 0
        if has_content:
            out.write("\n")
        out.write(fixed)
        if not fixed.endswith("\n"):
            out.write("\n")
    return len(fixed), replaced


def rewrite_links_in_file(path: Path) -> int:
    """Normalise the date links inside *path* in place and return how many were rewritten.

    The file is only written back when at least one link was rewritten.
    """
    updated, count = rewrite_date_page_refs(read_text(path))
    if count > 0:
        write_text(path, updated)
    return count


# ===== 掃描候選 =====

def find_candidates(root: Path):
    """Group the dated ``.md`` files directly under *root* by the date in their name.

    The scan is not recursive. Files whose name starts with ``merged ``
    (case-insensitive) and files whose stem does not parse as a date are skipped.

    Returns:
        ``{date: {"canonical": Path | None, "sources": [Path, ...]}}``.
        ``canonical`` is the file that will act as the merge target for that
        date; ``sources`` are the other files for the same date. Dates that
        only have a canonical file are kept too; the caller filters them out.
    """
    mapping = {}
    # Sorted so the outcome does not depend on the directory listing order.
    for p in sorted(root.glob("*.md")):
        if p.name.lower().startswith("merged "):
            continue
        parsed = parse_date_from_stem(p.stem)
        if not parsed:
            continue
        d, style = parsed
        bucket = mapping.setdefault(d, {"canonical": None, "sources": []})
        if style != "canonical":
            bucket["sources"].append(p)
            continue

        current = bucket["canonical"]
        if current is None:
            bucket["canonical"] = p
        elif p.stem == canonical_stem(d):
            # Two canonical-style files for one date: the exactly-named one
            # becomes the target, the earlier one is merged into it instead of
            # being silently forgotten.
            bucket["sources"].append(current)
            bucket["canonical"] = p
        else:
            bucket["sources"].append(p)
    return mapping


def process_dates(root: Path, mapping: dict, limit: int, dry_run: bool) -> None:
    """Rename or merge the journal files described by *mapping* and report progress.

    Dates are handled newest first, and at most *limit* dates that need work
    are processed. For each selected date:

    * a single source with no existing target has its date links normalised
      and is then renamed to the canonical name;
    * otherwise the target is created if missing, its links are normalised,
      every source is appended to it (links normalised) and each source is
      renamed with a ``merged `` prefix.

    Args:
        root: Folder in which a missing target file is created.
        mapping: ``{date: {"canonical": Path | None, "sources": [Path, ...]}}``.
        limit: Maximum number of dates to process in this run.
        dry_run: When true, only print the planned actions; no file is changed.
    """
    if not mapping:
        print("沒有可處理的日期檔。")
        return

    # Newest date first.
    dates_sorted = sorted(mapping.keys(), reverse=True)

    # Only keep the dates that actually need work:
    #   - they have sources (a merge is needed), or
    #   - they have no canonical file (one or more sources -> rename or merge).
    # Skipped: a canonical file with no sources (already correct, a no-op).
    work_dates = [d for d in dates_sorted if not (mapping[d]["canonical"] and not mapping[d]["sources"])]

    if not work_dates:
        print("找不到需要處理的日期(最新的都已是 MMMM do, yyyy 且無重複)。")
        return

    selected = work_dates[:limit]

    # Status overview for every known date, tagging the ones selected for this run.
    print("日期狀態(由新到舊):")
    for d in dates_sorted:
        info = mapping[d]
        if info["canonical"] and not info["sources"]:
            status = "OK 已是 MMMM do, yyyy(跳過)"
        elif info["canonical"] and info["sources"]:
            status = f"合併 {len(info['sources'])} 來源"
        elif not info["canonical"] and len(info["sources"]) == 1:
            status = "改名為 MMMM do, yyyy"
        else:
            status = f"合併 {len(info['sources'])} 來源(並建立目標)"
        tag = " ← 將處理" if d in selected else ""
        print(f"- {d.isoformat()}:{status}{tag}")

    for d in selected:
        info = mapping[d]
        target = info["canonical"] if info["canonical"] else root / f"{canonical_stem(d)}.md"
        # True only when no canonical file was found and the target path is free.
        will_create = (info["canonical"] is None) and (not target.exists())
        sources_sorted = sorted(info["sources"], key=lambda p: p.name.lower())

        print(f"\n=== {d.isoformat()} ===")

        # Single source and no target yet -> rename directly (no "merged" prefix).
        if info["canonical"] is None and len(sources_sorted) == 1 and will_create:
            src = sources_sorted[0]
            print(f"[SINGLE] 僅有來源:{src.name}")
            if dry_run:
                print(f"  [REWRITE] 將在檔內正規化 [[日期]] → [[{canonical_stem(d)}]]")
                print(f"  [RENAME]  '{src.name}'  →  '{target.name}'")
            else:
                replaced = rewrite_links_in_file(src)
                if replaced:
                    print(f"  [REWRITE] 已正規化 {replaced} 處 [[日期]]")
                src.rename(target)
                print(f"  [RENAMED] '{src.name}'  →  '{target.name}'")
                # Safety net: run the normalisation once more on the renamed file.
                replaced_t = rewrite_links_in_file(target)
                if replaced_t:
                    print(f"  [TARGET REWRITE] {target.name} 內正規化 {replaced_t} 處 [[日期]]")
            continue

        # Every other case (target exists, or several sources):
        # merge into the target and give each source a "merged" prefix.
        if will_create:
            print(f"[TARGET] {target.name}(將建立)")
        else:
            print(f"[TARGET] {target.name}")

        # Decide every post-merge name up front, before anything is renamed.
        planned = []
        for src in sources_sorted:
            new_path = prefixed_unique_path(src)  # "merged ..." name
            planned.append((src, new_path))

        if dry_run:
            print(f"  [TARGET REWRITE] 將正規化 {target.name} 內的 [[日期]](若存在)")
            for src, new_path in planned:
                print(f"  [COPY]   '{src.name}'  →  '{target.name}'(先正規化連結)")
                print(f"  [RENAME] '{src.name}'  →  '{new_path.name}'")
            continue

        if will_create:
            target.write_text("", encoding="utf-8")
            print(f"[CREATE TARGET] 已建立:{target.name}")

        # Normalise the links already present in the target before appending.
        replaced_t = rewrite_links_in_file(target)
        if replaced_t:
            print(f"  [TARGET REWRITE] {target.name} 內正規化 {replaced_t} 處 [[日期]]")

        total_chars = 0
        total_refs  = 0
        for src, new_path in planned:
            appended_len, replaced_refs = append_transformed(src, target)
            total_chars += appended_len
            total_refs  += replaced_refs
            print(f"  [COPIED]  '{src.name}'  →  '{target.name}'  (+{appended_len} chars, fixed {replaced_refs} refs)")
            # The source is renamed only after its content has been appended.
            src.rename(new_path)
            print(f"  [RENAMED] '{src.name}'  →  '{new_path.name}'")

        print(f"[SUMMARY] 合併 {len(planned)} 個來源,總附加 ~{total_chars} 字元、正規化 {total_refs} 個 [[日期]] → {target.name}")

    print(f"\n完成!此次處理日期數量:{len(selected)}。")


def main():
    """Command-line entry point: parse arguments, scan the folder and process it.

    Exits with status 1 when the given root is not an existing directory, and
    with an argparse usage error when ``--limit`` is negative.
    """
    ap = argparse.ArgumentParser(description="合併/正規化日記到 MMMM do, yyyy;跳過已是正確格式且無重複的最新日期。")
    ap.add_argument("root", nargs="?", default=".", help="要處理的資料夾(預設:目前資料夾)")
    # The help text shows the real default via %(default)s; it used to claim 1
    # while the actual default is 1000.
    ap.add_argument("-n","--limit", type=int, default=1000, help="一次處理的『日期數量』(預設:%(default)s;例如可改為 1 或 10)")
    ap.add_argument("--dry-run", action="store_true", help="僅列出將進行的動作,不修改檔案")
    args = ap.parse_args()

    # A negative limit would be used as a slice bound and silently drop the
    # oldest dates instead of limiting the count.
    if args.limit < 0:
        ap.error("--limit 不可為負數")

    root = Path(args.root).expanduser().resolve()
    # is_dir() rather than exists(): a regular file is not a usable journal folder.
    if not root.is_dir():
        print(f"找不到資料夾:{root}")
        sys.exit(1)

    mapping = find_candidates(root)
    process_dates(root, mapping, limit=args.limit, dry_run=args.dry_run)


if __name__ == "__main__":

    main()


沒有留言:

張貼留言