Skip to content

Commit 388af4b

Browse files
committed
a
1 parent 60a7404 commit 388af4b

File tree

3 files changed

+21
-3
lines changed

3 files changed

+21
-3
lines changed

get_metadata.py

-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import datasets
22
import datetime
33
import json
4-
from tqdm import tqdm
5-
import os
64
ds = datasets.load_dataset("CCRss/arXiv_dataset", split="train")
75

86
# only keep "id"

process_dataset.py

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import datasets
2+
import os
3+
import datetime
4+
from pathlib import Path
5+
from tqdm import tqdm
6+
import argparse
7+
8+
parser = argparse.ArgumentParser()
9+
parser.add_argument("--papers", type=str, required=True)
10+
parser.add_argument("--push", type=str, required=True)
11+
args = parser.parse_args()
12+
13+
ds = datasets.load_dataset("CCRss/arXiv_dataset", split="train")
14+
15+
16+
def process(ex):
17+
return ex
18+
19+
20+
ds = ds.map(process, num_proc=os.cpu_count())

src/main.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ async fn download_paper(id: &str) -> Result<Paper, Box<dyn Error>> {
8686
let response = reqwest::get(&url).await?;
8787
// check status code, if 429, wait and retry
8888
if response.status().as_u16() == 429 {
89-
println!("429: Waiting 120 seconds");
89+
println!("429: Waiting {} seconds", backoff);
9090
tokio::time::sleep(std::time::Duration::from_secs(backoff)).await;
9191
backoff *= 2;
9292
continue;

0 commit comments

Comments
 (0)