-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathMakefile
60 lines (45 loc) · 2.87 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# bibclean command line used by every formatting step, so that all
# generated .bib files share one consistent layout
BIBCLEAN := bibclean -max-width 120 -align-equals -no-fix-names
# bibtool command line that adds a consistent citation key
# (key pattern built from author, year and title)
BIBTOOLKEY := bibtool -f 'cc:%4p(author):%4d(year):%4T(title)'
# Source bibliographies, sorted by file name.
# - simply expanded (:=) so the directory is globbed once, and the
#   prerequisite list and the recipe arguments always agree
# - bib/cc*.formatted.bib files are artifacts of the %.formatted.bib rule
#   (removed again by `make clean`); they also match the glob and must not
#   be merged in as if they were sources
BIBSRC := $(filter-out %.formatted.bib,$(sort $(wildcard bib/cc*.bib)))
# Default goal: build the HTML export of the bibliography.
all: html/commoncrawl.html
# `all` names a task, not a file; declare it phony so that a stray file
# called "all" cannot make the goal look up to date
.PHONY: all
# Merge all source bibliographies into a single, consistently formatted file.
# The first recipe line aborts with a clear message when no source file was
# found; without it bibclean would be started without any file argument.
tmp/commoncrawl.bib: $(BIBSRC)
	$(if $(strip $^),,$(error no bibliography sources found matching bib/cc*.bib))
	mkdir -p $(@D)
	$(BIBCLEAN) $^ >$@
# HTML export
# bibtex2html is started from inside html/ and is given the merged .bib file
# by a path relative to that directory. The steps are chained with && so the
# converter never runs in the wrong directory when mkdir or cd fails.
html/commoncrawl.html: tmp/commoncrawl.bib
	mkdir -p $(@D) && cd $(@D) && bibtex2html --charset utf-8 ../$<
# CSV export for Hugging Face 🤗
# export-csv.py reads the merged .bib file ($<) and prints the CSV to stdout.
# The script is listed as a prerequisite (after the .bib file, so $< is
# unchanged) to regenerate the CSV whenever the exporter itself is edited.
tmp/commoncrawl_annotated.csv: tmp/commoncrawl.bib export-csv.py
	python3 export-csv.py $< >$@
# JSON annotated export for Hugging Face 🤗
# Depends on gscholar_alerts/citations.jsonl, but that file is deliberately
# not declared as a prerequisite: most people do not have the /eml/
# subdirectory needed to (re)build it.
# split-jsonl.py is run from inside tmp/ (presumably it writes its output
# into the current directory -- verify against the script).
# The steps are chained with && so a failed cd stops the recipe, and python3
# is used as in the other recipes of this Makefile.
tmp/commoncrawl_counts.csv:
	mkdir -p $(@D) && cd $(@D) && python3 ../split-jsonl.py ../gscholar_alerts/citations.jsonl
# Format a single .bib file: `make path/to/NAME.formatted.bib` runs bibclean
# over path/to/NAME.bib and stores the result next to it.
%.formatted.bib: %.bib
	$(BIBCLEAN) $< > $@
# Prepare a .bib file for annotation: add the CC annotation fields and an ID.
# Three-stage pipeline:
#   1. perl reads the input paragraph-wise (-000) and rewrites the end of each
#      record: it appends empty cc-author-affiliation and cc-class fields,
#      plus an empty url field unless the record already contains one
#   2. bibtool assigns the consistent citation key
#   3. perl post-processes lines starting with "@": ".ea:20" becomes
#      "EtAl:20" and all remaining dots are removed
%.prepared.bib: %.bib
	perl -000 -lne '$$url=" url = {},\n"; $$url = "" if /\surl\s*=/; s@([}"]?),?\n\}$$@$$1,\n$$url cc-author-affiliation = {},\n cc-class = {},\n}@; print' $< \
	  | $(BIBTOOLKEY) \
	  | perl -lpe 'do { s@\.ea:20@EtAl:20@; s@\.@@g } if /^@/' >$@
# some statistics about the citations
# cc-annotations: count how often each "cc-..." field name is assigned in
# bib/*.bib and print "count<TAB>field", highest count first.
# Declared phony because the target is a report, not a file.
.PHONY: cc-annotations
cc-annotations:
	perl -lne '$$h{$$1}++ if /^\s*(cc(?:-[a-z_0-9]+)+)\s*=/; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib \
	  | sort -k1,1nr
# cc-classes: frequency of the individual values of the cc-class field.
# The perl program collects the (possibly multi-line) field value, splits it
# at commas, collapses whitespace runs to a single space and counts every
# item; output is "count<TAB>class", highest count first.
# Declared phony because the target is a report, not a file.
.PHONY: cc-classes
cc-classes:
	perl -lne 'if (s/^\s*cc-class\s*=\s*["{]// .. s/["}],?$$//) { $$classes .= $$_ } elsif (defined $$classes) { do { s@\s+@ @g; $$h{$$_}++ } for split /,\s*/, $$classes; $$classes = undef; }; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib \
	  | sort -k1,1nr
# cc-author-affiliations: frequency of the individual values of the
# cc-author-affiliation field. Works like the cc-class statistics, except
# that the field value is split at semicolons; output is
# "count<TAB>affiliation", highest count first.
# Declared phony because the target is a report, not a file.
.PHONY: cc-author-affiliations
cc-author-affiliations:
	perl -lne 'if (s/^\s*cc-author-affiliation\s*=\s*["{]// .. s/["}],?$$//) { $$classes .= $$_ } elsif (defined $$classes) { do { s@\s+@ @g; $$h{$$_}++ } for split /;\s*/, $$classes; $$classes = undef; }; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib \
	  | sort -k1,1nr
# cc-derived-datasets: frequency of the dataset names listed in the fields
# cc-derived-dataset-used, cc-derived-dataset-cited and
# cc-derived-dataset-about. The collected field text is split at commas and
# every item is counted; output is "count<TAB>dataset", highest count first.
# Declared phony because the target is a report, not a file.
.PHONY: cc-derived-datasets
cc-derived-datasets:
	perl -lne 'if (s/^\s*cc-derived-dataset-(?:used|cited|about)\s*=\s*["{]// .. s/["}],?$$//) { $$datasets .= $$_ . ", " } elsif (defined $$datasets) { $$h{$$_}++ for split /,\s*/, $$datasets; $$datasets = undef; } END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib \
	  | sort -k1,1nr
# count: number of lines starting with "@" (i.e. BibTeX entries) per file in
# bib/. grep prints "file:count"; perl swaps the first two colon-separated
# fields into "count<TAB>file"; the result is sorted by file name.
# Declared phony because the target is a report, not a file.
.PHONY: count
count:
	grep -c '^@' bib/*.bib \
	  | perl -aF':' -lne 'print join("\t", $$F[1], $$F[0], @F[2..$$#F])' \
	  | sort -k2,2
# bibtex-fields: count how often each field name is assigned (a line of the
# form `name = "` or `name = {`) in bib/*.bib and print "count<TAB>field",
# highest count first.
# Declared phony because the target is a report, not a file.
.PHONY: bibtex-fields
bibtex-fields:
	perl -lne '$$h{$$1}++ if /^\s*([A-Za-z_0-9-]+)\s*=\s*["{]/; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib \
	  | sort -k1,1nr
# clean: remove the formatted copies written by the %.formatted.bib rule.
# Declared phony so a file named "clean" cannot disable the target.
# $(RM) is `rm -f`, so the target also succeeds when there is nothing to
# remove.
.PHONY: clean
clean:
	$(RM) bib/*.formatted.bib
# Google Scholar Alerts
# Run the alert parser on the gscholar_alerts/eml/ directory (passed as $<)
# and store its output sorted in byte order (LC_ALL=C), so the result does
# not depend on the locale of the machine.
gscholar_alerts/extracted_citations.jsonl: gscholar_alerts/eml/
	python3 gscholar_alerts/parse_scholar_alert_eml.py $< \
	  | LC_ALL=C sort > $@
# Reduce the extracted alert records to citations: keep only records that
# have both a title and authors, and drop the idx, date, data, ref and link
# keys; one compact JSON object per output line.
gscholar_alerts/citations.jsonl: gscholar_alerts/extracted_citations.jsonl
	jq --compact-output \
	  'select(.title != null and .authors != null) | del(.idx, .date, .data, .ref, .link)' \
	  $< > $@