-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path1.download.py
118 lines (92 loc) · 3.29 KB
/
1.download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
# This is a basic parsing script that will generate a tree organized by
# repository, and save a concatenated version of all markdown files found
# in the repository under a nested output directory (defaults to data).
# We can then use that for further processing and analysis.
__author__ = "Vanessa Sochat"
__copyright__ = "Copyright 2022, Vanessa Sochat"
__license__ = "MPL 2.0"
from rse.main import Encyclopedia
from rse.utils.command import Command
from rse.utils.file import recursive_find, read_file, mkdir_p, write_file, write_json
import tempfile
import shutil
import argparse
import re
import sys
import os
def clone(url, dest):
    """
    Clone a git repository into a subdirectory of dest.

    The subdirectory is named after the last path component of the url
    (trailing slashes are ignored when picking that name).

    Arguments:
      url  (str): repository url handed to "git clone"
      dest (str): parent directory to clone under

    Returns: the path of the new clone, or None if git exited non-zero.
    """
    # os.path.basename returns "" for a url that ends in "/", which would
    # make the clone target the parent directory itself - and main() removes
    # whatever path we return. Strip trailing slashes before taking the name.
    name = os.path.basename(url.rstrip("/"))
    dest = os.path.join(dest, name)

    # NOTE(review): the command is built by string interpolation; this assumes
    # url and dest contain no whitespace or shell metacharacters - confirm how
    # Command tokenizes its input before passing untrusted urls.
    cmd = Command("git clone %s %s" % (url, dest))
    cmd.execute()
    if cmd.returncode != 0:
        print("Issue cloning %s" % url)
        return None
    return dest
def get_parser():
    """
    Build the command line parser for the download script.

    Options:
      --settings-file  custom settings file path (stored as settings_file)
      -o / --outdir    output directory, defaulting to "data" under the
                       current working directory

    Returns: the configured argparse.ArgumentParser
    """
    default_outdir = os.path.join(os.getcwd(), "data")

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="Research Software Encyclopedia Analyzer",
    )

    # (flags, keyword arguments) for every option, registered in order
    options = (
        (
            ("--settings-file",),
            {"dest": "settings_file", "help": "custom path to settings file."},
        ),
        (
            ("-o", "--outdir"),
            {"help": "Output directory for data.", "default": default_outdir},
        ),
    )
    for flags, settings in options:
        cli.add_argument(*flags, **settings)
    return cli
def _concat_docs(repo_dir):
    """
    Concatenate the markdown, Rmarkdown, Rd and restructured text files
    found under repo_dir, skipping anything that looks like a license.

    Returns: one string with the content of every readable file (empty if
    nothing was found).
    """
    chunks = []
    for pattern in ("*.md", "*.Rmd", "*.Rd", "*.rst"):
        for path in recursive_find(repo_dir, pattern):
            # Match against the path relative to the clone, so a repository
            # (or temporary directory) whose own name contains "license"
            # does not cause every one of its files to be skipped.
            # NOTE(review): assumes recursive_find yields paths under repo_dir.
            relpath = os.path.relpath(path, repo_dir)
            if re.search("LICENSE", relpath, re.IGNORECASE):
                continue
            try:
                chunks.append("".join(read_file(path)))
            except Exception:
                # Best effort: one unreadable file should not lose the repo,
                # but Ctrl-C / SystemExit must still propagate.
                print("Issue parsing file %s" % path)
    return "".join(chunks)


def main():
    """
    Clone every repository in the encyclopedia and save its concatenated
    documentation to <outdir>/<repo uid>/CONCAT.md.

    Repositories that already have a CONCAT.md are not cloned again. Topics,
    language and url for every repository are written to docs/meta.json
    (relative to the current working directory) at the end.

    Exits with an error message if the output directory does not exist.
    """
    parser = get_parser()
    args, _ = parser.parse_known_args()

    # The output directory must already exist (it is not created here)
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        sys.exit("%s does not exist!" % args.outdir)

    pedia = Encyclopedia(args.settings_file)

    # Materialize the listing up front since we query pedia inside the loop
    repos = list(pedia.list())

    # Keep a master lookup of topics and metadata (a lookup)
    meta = {"topics": {}, "language": {}, "url": {}}

    # Create a base temporary folder to work from, always removed on exit
    tempdir = tempfile.mkdtemp()
    try:
        for reponame in repos:
            repo = pedia.get(reponame[0])
            meta["topics"][repo.uid] = repo.data["data"].get("topics", [])
            meta["language"][repo.uid] = repo.data["data"].get("language", "unknown")
            meta["url"][repo.uid] = repo.url

            datadir = os.path.join(outdir, repo.uid)
            destfile = os.path.join(datadir, "CONCAT.md")

            # Don't parse twice!
            if os.path.exists(destfile):
                continue

            dest = clone(repo.url, tempdir)
            if not dest:
                continue

            # Remove the clone as soon as we have its text, including when the
            # repository turns out to be empty: a leftover clone wastes disk
            # and breaks a later clone that maps to the same directory name.
            try:
                text = _concat_docs(dest)
            finally:
                shutil.rmtree(dest, ignore_errors=True)

            # Don't include empty repos
            if not text:
                continue

            print("Adding data for %s" % repo.uid)
            mkdir_p(datadir)
            write_file(destfile, text)
    finally:
        shutil.rmtree(tempdir, ignore_errors=True)

    # Save topics, etc. to file (in docs so we don't overwhelm github pages)
    write_json(meta, os.path.join("docs", "meta.json"))
# Only run the download when executed as a script, not when imported.
if __name__ == "__main__":
    main()