
Commit a0f3782

Doc download job
1 parent 927a3dd commit a0f3782

6 files changed (+28, -10 lines)

data_creation/batch_job.sh

+7, -8
@@ -7,28 +7,27 @@ mkdir -p $TARGET_DIR
 rsync -ruC --exclude pre_computed --exclude processed_data --exclude *.pyc $SOURCE/ $TARGET_DIR

 work_dir=$TARGET_DIR/
-log_dir=/mount/biglm_data/ELI5/logs/
-output_dir=/mount/biglm_data/ELI5/processed_data
+log_dir=/mount/biglm_data/ELI5/logs_docs/
 mkdir -p $log_dir

-export WORLD_SIZE=9
+export WORLD_SIZE=16
 nodes=$(python -c "for i in range(1,$WORLD_SIZE): print(i)")
 node_ids=($(python -c "for i in range(0,$WORLD_SIZE): print(i)"))
-years=($(python -c "for i in range(2011,2011+$WORLD_SIZE): print(i)"))
+#years=($(python -c "for i in range(2011,2011+$WORLD_SIZE): print(i)"))
 #master=$(ssh -x -o LogLevel=ERROR worker-1 "echo \$hostname" )
 for i in $nodes; do
 worker=worker-${node_ids[$i]}
 sy=${years[$i]}
-kill -9 $(ps -x |grep "download_reddit_qalist.py" |grep -v grep|awk -F' ' '{print $1}')
+rank=$i
 ssh -x -o LogLevel=ERROR $worker "mkdir -p $TARGET_DIR"
 rsync -ruC --exclude pre_computed --exclude processed_data -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR" $TARGET_DIR/ $worker:$TARGET_DIR
-ssh -x -o LogLevel=ERROR $worker "mkdir -p $log_dir; cd $work_dir; nohup ./download_qa.sh $sy > ${log_dir}/nohup_${i}.std 2> ${log_dir}/nohup_${i}.err &"
+ssh -x -o LogLevel=ERROR $worker "mkdir -p $log_dir; cd $work_dir; nohup ./download_docs.sh $rank > ${log_dir}/nohup_${i}.std 2> ${log_dir}/nohup_${i}.err &"
 done

 i=0
+rank=$i
 worker=worker-${node_ids[$i]}
 sy=${years[$i]}
-kill -9 $(ps -x |grep "download_reddit_qalist.py" |grep -v grep|awk -F' ' '{print $1}')
 ssh -x -o LogLevel=ERROR $worker "mkdir -p $TARGET_DIR"
 rsync -ruC --exclude pre_computed --exclude processed_data -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR" $TARGET_DIR/ $worker:$TARGET_DIR
-ssh -x -o LogLevel=ERROR $worker "mkdir -p $log_dir; cd $work_dir; ./download_docs.sh $rank"
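
Note: the launch pattern after this change, sketched in Python purely as a reading aid. It only prints the commands batch_job.sh would issue; worker-<rank> host names and WORLD_SIZE=16 are taken from the diff, and it does not ssh anywhere.

# Rough sketch of the fan-out above: workers 1..15 get download_docs.sh started
# in the background over ssh, then rank 0 runs in the foreground so the job blocks.
WORLD_SIZE = 16
for rank in range(1, WORLD_SIZE):
    print(f"ssh worker-{rank} 'nohup ./download_docs.sh {rank} &'")   # background shard
print("ssh worker-0 './download_docs.sh 0'")                          # foreground, rank 0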

data_creation/download_docs.sh

+15
@@ -0,0 +1,15 @@
+#!/bin/bash
+SCRIPT=$(readlink -f "$0")
+SCRIPT_DIR=$(dirname "$SCRIPT")
+cd $SCRIPT_DIR
+rank=$1
+
+#pip install -r requirements.txt
+output=/mount/biglm_data/ELI5/processed_data/support_docs_$rank
+pre_computed=/mount/biglm_data/ELI5/pre_computed
+mkdir -p $output
+slsize=$[71520/16]
+
+kill -9 $(ps -x |grep "download_support_docs.py" |grep -v grep|awk -F' ' '{print $1}')
+python download_support_docs.py --slnum $rank --slsize $slsize --subreddit_names '["explainlikeimfive", "AskHistorians", "askscience"]' --output_dir $output \
+  --wet_urls ${pre_computed}/wet.paths --pre_computed_dir ${pre_computed}
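
Note: slsize=$[71520/16] presumably splits the 71,520 WET paths of the August 2018 crawl into one contiguous slice per rank. The sketch below assumes --slnum/--slsize select urls[rank*slsize:(rank+1)*slsize]; the real slicing lives in download_support_docs.py, and the URL strings here are made up.

def wet_slice_for_rank(wet_urls, rank, world_size=16):
    """Contiguous block of WET URLs one rank would handle, under the assumed --slnum/--slsize semantics."""
    slsize = len(wet_urls) // world_size                   # 71520 // 16 == 4470 files per worker
    return wet_urls[rank * slsize:(rank + 1) * slsize]

wet_urls = [f"wet-path-{i:05d}" for i in range(71520)]     # hypothetical placeholder paths
assert len(wet_slice_for_rank(wet_urls, 3)) == 4470        # rank 3 covers positions 13410..17879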

data_creation/download_qa.sh

+1, -1
@@ -1,7 +1,7 @@
 #!/bin/bash
 SCRIPT=$(readlink -f "$0")
 SCRIPT_DIR=$(dirname "$SCRIPT")
-cd SCRIPT_DIR
+cd $SCRIPT_DIR
 year=$1

 pip install -r requirements.txt

data_creation/download_support_docs.py

+3, -1
@@ -29,6 +29,8 @@ def main():
                         help='how often are results written to file')
     parser.add_argument('-o', '--output_dir', default='processed_data/collected_docs', type=str,
                         help='where to save the output')
+    parser.add_argument('--pre_computed_dir', default='pre_computed', type=str,
+                        help='where to load pre_computed')
     args = parser.parse_args()
     # parse full list of wet urls
     # slice urls for WET files can be found at https://commoncrawl.org/2018/08/august-2018-crawl-archive-now-available/
@@ -43,7 +45,7 @@ def main():
     sr_names = json.loads(args.subreddit_names)
     for name in sr_names:
         print(name)
-        ccrawl_ids_maps[name] = json.load(open('pre_computed/%s_ccrawl_ids.json' % (name,)))
+        ccrawl_ids_maps[name] = json.load(open(f'{args.pre_computed_dir}/{name}_ccrawl_ids.json'))
         for i, (k, _) in enumerate(ccrawl_ids_maps[name]):
             reddit_id_group[k] = (i * 10) // len(ccrawl_ids_maps[name])
     # make a list of the CommonCrawl UIDs we want to process and keep
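
Note: the unchanged context line reddit_id_group[k] = (i * 10) // len(ccrawl_ids_maps[name]) buckets each reddit id into one of ten groups by its position in the per-subreddit ccrawl id list; a small self-contained illustration with made-up ids:

ccrawl_ids = [(f"reddit_id_{i}", None) for i in range(25)]   # hypothetical (reddit_id, ccrawl_uid) pairs
reddit_id_group = {}
for i, (k, _) in enumerate(ccrawl_ids):
    reddit_id_group[k] = (i * 10) // len(ccrawl_ids)         # positions 0-2 -> group 0, 3-4 -> group 1, ...
assert reddit_id_group["reddit_id_24"] == 9                  # last position lands in group 9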

data_creation/pre_computed/AskHistorians_unigram_counts.json

+1
Large diffs are not rendered by default.

data_creation/pre_computed/askscience_unigram_counts.json

+1
Large diffs are not rendered by default.
