-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathauto_pdf_translate.py
More file actions
executable file
·180 lines (144 loc) · 4.85 KB
/
auto_pdf_translate.py
File metadata and controls
executable file
·180 lines (144 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
"""Auto-translate PDFs in ~/Downloads using beko-translate-pdf.
Env overrides:
- AUTO_PDF_TRANSLATE_INPUT_DIR
- AUTO_PDF_TRANSLATE_OUTPUT_DIR
- AUTO_PDF_TRANSLATE_MODEL
Defaults:
- Scans ~/Downloads (non-recursive).
- Outputs to ~/Desktop/pdf_translated.
- Uses --model plamo (override with --model).
- Skips PDFs that already have a corresponding .ja.pdf in the output dir.
- Use --input-dir to scan a different directory.
- Use --arxiv to target arXiv-like filenames only.
- Use --days to limit to PDFs modified within the last N days.
(Only applied when --days is specified.)
"""
from __future__ import annotations
import argparse
import os
import re
import subprocess
import sys
import time
from pathlib import Path
from typing import Iterable, List
# Built-in defaults for the scan directory, the output directory and the
# translation model.
DEFAULT_INPUT_DIR = Path.home().joinpath("Downloads")
DEFAULT_OUTPUT_DIR = Path.home().joinpath("Desktop", "pdf_translated")
DEFAULT_MODEL = "plamo"

# Names of the environment variables that can override the defaults above.
ENV_INPUT_DIR = "AUTO_PDF_TRANSLATE_INPUT_DIR"
ENV_OUTPUT_DIR = "AUTO_PDF_TRANSLATE_OUTPUT_DIR"
ENV_MODEL = "AUTO_PDF_TRANSLATE_MODEL"
def env_or_default_path(name: str, default: Path) -> Path:
    """Return the path held by environment variable *name*, or *default*.

    An unset or empty variable yields *default* unchanged. Otherwise the
    value is wrapped in a ``Path`` and a leading ``~`` is expanded.
    """
    raw = os.environ.get(name, "")
    return Path(raw).expanduser() if raw else default
def env_or_default_str(name: str, default: str) -> str:
    """Return environment variable *name*, or *default* if unset or empty."""
    return os.environ.get(name) or default
# Re-bind the defaults so that a non-empty environment override replaces the
# built-in value; these names are later used as the argparse defaults.
DEFAULT_INPUT_DIR = env_or_default_path(ENV_INPUT_DIR, DEFAULT_INPUT_DIR)
DEFAULT_OUTPUT_DIR = env_or_default_path(ENV_OUTPUT_DIR, DEFAULT_OUTPUT_DIR)
DEFAULT_MODEL = env_or_default_str(ENV_MODEL, DEFAULT_MODEL)
# arXiv-like file name: four digits, a dot, four or five digits, a "v<digits>"
# version, an optional " (<digits>)" suffix, then ".pdf".
# NOTE(review): the " (<digits>)" suffix presumably covers duplicate downloads
# renamed by the browser -- confirm.
ARXIV_PATTERN = re.compile(r"^\d{4}\.\d{4,5}v\d+( \(\d+\))?\.pdf$")
def _expanded_path(value: str) -> Path:
    """Convert a command-line string to a ``Path`` with a leading ``~`` expanded."""
    return Path(value).expanduser()


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the script.

    Returns:
        The ``argparse`` namespace with the attributes ``arxiv`` (bool),
        ``days`` (a number, or ``None`` when the option is absent),
        ``input_dir`` and ``output_dir`` (``Path``) and ``model``.
    """
    parser = argparse.ArgumentParser(
        description="Auto-translate PDFs in ~/Downloads using beko-translate-pdf."
    )
    parser.add_argument(
        "-a",
        "--arxiv",
        action="store_true",
        help="Only include PDFs with arXiv-like filenames.",
    )
    parser.add_argument(
        "--days",
        nargs="?",
        const=3,
        default=None,
        type=float,
        help=(
            "Limit to PDFs modified within the last N days. "
            "If provided without a value, defaults to 3. "
            "No date filtering is applied unless --days is specified."
        ),
    )
    # Directory options expand "~" themselves: the shell does not expand it in
    # forms such as "--input-dir=~/papers", and the environment overrides are
    # already expanded, so both sources now behave the same way.
    parser.add_argument(
        "-i",
        "--input-dir",
        type=_expanded_path,
        default=DEFAULT_INPUT_DIR,
        help="Directory to scan for PDFs.",
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        type=_expanded_path,
        default=DEFAULT_OUTPUT_DIR,
        help="Output directory for translated PDFs.",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Translation model to use (default: {DEFAULT_MODEL}).",
    )
    return parser.parse_args()
def find_pdfs(directory: Path) -> Iterable[Path]:
    """Yield the PDF files located directly in *directory*, sorted by path.

    The search is non-recursive and considers entries whose name matches
    ``*.pdf``. Entries that are not regular files (for example a directory
    that happens to be named ``something.pdf``, or a dangling symlink) are
    skipped so they are never handed to the translator.
    """
    for path in sorted(directory.glob("*.pdf")):
        if path.is_file():
            yield path
def filter_arxiv(paths: Iterable[Path]) -> List[Path]:
    """Return the paths whose file name looks like an arXiv download.

    A path is kept when ``ARXIV_PATTERN`` matches its ``name``; the input
    order is preserved.
    """
    selected = []
    for candidate in paths:
        if ARXIV_PATTERN.match(candidate.name) is not None:
            selected.append(candidate)
    return selected
def filter_days(paths: Iterable[Path], days: float) -> List[Path]:
    """Return the paths modified within the last *days* days.

    Each path's modification time is compared against a cutoff computed
    once, *days* days before the current time. Paths that no longer exist
    when they are inspected are left out. Input order is kept.
    """
    oldest_allowed = time.time() - (days * 24 * 60 * 60)

    def modified_recently(candidate: Path) -> bool:
        # A file may disappear between listing and stat; treat it as filtered.
        try:
            return candidate.stat().st_mtime >= oldest_allowed
        except FileNotFoundError:
            return False

    return [candidate for candidate in paths if modified_recently(candidate)]
def has_translated_output(pdf_path: Path, output_dir: Path) -> bool:
    """Tell whether *pdf_path* already has a translation in *output_dir*.

    The translation is expected to be named ``<stem>.ja.pdf``, where
    ``<stem>`` is the source file name without its final suffix.
    """
    expected = output_dir.joinpath(pdf_path.stem + ".ja.pdf")
    return expected.exists()
def validate_args(args: argparse.Namespace) -> int:
    """Check the parsed arguments and report the first problem on stderr.

    Args:
        args: Namespace providing ``input_dir`` (a ``Path``) and ``days``
            (a number, or ``None`` when no date filter was requested).

    Returns:
        ``0`` when the arguments are usable, ``1`` otherwise.
    """
    if not args.input_dir.is_dir():
        sys.stderr.write(f"Input directory not found: {args.input_dir}\n")
        return 1
    # Written as "not > 0" rather than "<= 0" so that NaN, which compares
    # false against everything, is rejected instead of slipping through and
    # silently filtering out every file.
    if args.days is not None and not args.days > 0:
        sys.stderr.write("--days must be a positive number.\n")
        return 1
    return 0
def main() -> int:
    """Collect the PDFs to translate and run ``beko-translate-pdf`` on them.

    PDFs are gathered from the input directory, optionally narrowed by the
    ``--arxiv`` and ``--days`` filters, and those that already have a
    ``.ja.pdf`` counterpart in the output directory are skipped. The output
    directory is created if needed.

    Returns:
        The process exit status: the validation error code, ``0`` when there
        is nothing to translate, ``127`` when the translator executable
        cannot be found, otherwise the translator's own return code.
    """
    import shlex  # local import: only needed to render the log line

    args = parse_args()
    validation_error = validate_args(args)
    if validation_error:
        return validation_error

    paths = list(find_pdfs(args.input_dir))
    if args.arxiv:
        paths = filter_arxiv(paths)
    if args.days is not None:
        paths = filter_days(paths, args.days)

    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    if not paths:
        print("No PDFs found to translate.")
        return 0

    targets = [path for path in paths if not has_translated_output(path, output_dir)]
    if not targets:
        print("All PDFs already have translated outputs.")
        return 0

    cmd = [
        "beko-translate-pdf",
        *[str(p) for p in targets],
        "--output-dir",
        str(output_dir),
        "--model",
        str(args.model),
    ]
    # Quote every argument so file names containing spaces or parentheses are
    # shown unambiguously and the line can be pasted back into a shell.
    print("[run]", " ".join(shlex.quote(part) for part in cmd))
    try:
        result = subprocess.run(cmd)
    except FileNotFoundError:
        # The translator is not installed or not on PATH: report it plainly
        # instead of crashing with a traceback.
        sys.stderr.write(f"Command not found: {cmd[0]}\n")
        return 127
    return result.returncode
# When executed as a script, run the CLI and use main()'s return value as
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())