Skip to content

Commit

Permalink
Merge pull request OpenKnowledgeMaps#477 from OpenKnowledgeMaps/fix-t…
Browse files Browse the repository at this point in the history
…itlestopwords

fix stopwords in areatitles

Former-commit-id: 88ede04
  • Loading branch information
chreman authored Sep 7, 2020
2 parents d9e54da + 41749eb commit f130dbf
Show file tree
Hide file tree
Showing 27 changed files with 515 additions and 171 deletions.
28 changes: 16 additions & 12 deletions server/preprocessing/other-scripts/summarize.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,23 @@ create_cluster_labels <- function(clusters, metadata,
for (k in seq(1, clusters$num_clusters)) {
group = c(names(clusters$groups[clusters$groups == k]))
matches = which(clusters$labels%in%group)
clusters$cluster_labels[c(matches)] = tfidf_top_names[k]
summary = tfidf_top_names[[k]]
if (summary == "") {
candidates = mapply(paste, metadata$title[matches], metadata$paper_abstract[matches])
candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
candidates = mapply(paste, candidates, candidates_bigrams, candidates_trigrams)
nn_count = sort(table(strsplit(paste(candidates, collapse=" "), " ")), decreasing = T)
summary <- filter_out_nested_ngrams(names(nn_count), 3)
summary = lapply(summary, FUN = function(x) {paste(unlist(x), collapse="; ")})
summary = gsub("_", " ", summary)
summary = paste(summary, collapse="; ")
}
clusters$cluster_labels[c(matches)] = summary
}
clusters$cluster_labels <- fill_empty_areatitles(clusters$cluster_labels, metadata)
clusters$cluster_labels <- unlist(clusters$cluster_labels)
type_counts <- get_type_counts(unlowered_corpus)
clusters$cluster_labels <- fix_cluster_labels(clusters$cluster_labels, type_counts)
return(clusters)
Expand Down Expand Up @@ -180,15 +193,6 @@ fill_empty_clusters <- function(nn_tfidf, nn_corpus){
return(replacement_tfidf_top)
}

# Fill in empty area titles from the subject of the corresponding document.
#
# A label counts as empty when it has at most one character. Its replacement
# is the entry of metadata$subject at the same position, with ";" separators
# turned into ", " and the first letter capitalised.
#
# cluster_labels: character vector (or list of strings) of area titles
# metadata:       data frame with a `subject` column; entries are looked up
#                 by the position of the empty label
# returns:        cluster_labels with empty entries replaced wherever a
#                 subject is available; labels whose subject is missing (or
#                 all labels, if there is no `subject` column) are returned
#                 unchanged
fill_empty_areatitles <- function(cluster_labels, metadata) {
  # vapply gives a plain integer vector instead of relying on the implicit
  # coercion of a list inside the comparison
  label_lengths <- vapply(cluster_labels, function(x) nchar(x), integer(1),
                          USE.NAMES = FALSE)
  missing_areatitles <- which(label_lengths <= 1)
  if (length(missing_areatitles) == 0 || is.null(metadata$subject)) {
    return(cluster_labels)
  }
  replacement_areatitles <- as.character(metadata$subject[missing_areatitles])
  replacement_areatitles <- gsub(";", ", ", replacement_areatitles)
  # paste0() turns NA into the string "NA", so a missing subject would end up
  # as the title "NANA"; only substitute where a subject is actually present
  has_subject <- !is.na(replacement_areatitles)
  replacement_areatitles <- paste0(
    toupper(substr(replacement_areatitles, 1, 1)),
    substr(replacement_areatitles, 2, nchar(replacement_areatitles))
  )
  cluster_labels[missing_areatitles[has_subject]] <-
    replacement_areatitles[has_subject]
  return(cluster_labels)
}

get_title_ngrams <- function(titles, stops) {
# for ngrams: we have to collapse with "_" or else tokenizers will split ngrams again at that point and we'll be left with unigrams
titles_bigrams = lapply(lapply(titles, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split = " ")), 2), paste, collapse = "_"))), paste, collapse = " ")
Expand Down
76 changes: 36 additions & 40 deletions server/workers/tests/Backend regression test cases.csv
Original file line number Diff line number Diff line change
@@ -1,40 +1,36 @@
case id,data integration,affected component,search query,from,to,article types,sorting,test,failure reason,test ideas,Issue #,insights
0,PubMed,clustering,russian,1809-01-01,2017-12-04,all,most-recent,elbow heuristic succeeds,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/169,
1,BASE,clustering,chicken eggshell coloration,1665-01-01,2018-09-19,['121'],most-recent,clustering of n=2 items succeeds,,test that randomly selects two items,https://github.com/OpenKnowledgeMaps/Headstart/issues/292,
,OpenAIRE,retrieval,295562,,,,,query succeeds,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/301,
,all,retrieval,any,any,any,any,any,subject_orig field exists,,output validation schema,https://github.com/OpenKnowledgeMaps/Headstart/pull/440,
,OpenAIRE,retrieval,any,any,any,any,any,at least one document with more than one author,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/250,
,all,retrieval,any,any,any,any,any,presence of core fields in input_data,,output validation schema,https://github.com/OpenKnowledgeMaps/Headstart/issues/265,
2,PubMed,summarization,edelstahl,1809-01-01,2018-10-05,all,most-recent,no empty bubble titles,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/303,
,OpenAIRE,clustering,EC/226716,,,,,clustering succeeds for n>500 items,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/246,
,OpenAIRE,clustering,EC/604102,,,,,clustering succeeds for n>500 items,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/246,
,OpenAIRE,enrichment,EC/647557,,,,,enrichment succeeds for malformed metadata,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/244,
,OpenAIRE,,,,,,,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/261,
,BASE,retrieval,boolean queries,,,,,success for all boolean queries,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/198,
,all,retrieval,queries with Umlaute,,,,,success for all queries,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/285,
,PubMed,retrieval,any,,,,,success for many article types,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/197,
,all,summarization,,,,,,no non-unique bubble titles,,,https://github.com/OpenKnowledgeMaps/Headstart/pull/377,
,BASE,retrieval,,,,,,mixed queries succeed,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/267,
,OpenAire,retrieval,any,,,,,all test queries succeed,,,https://trello.com/c/gK4bDCHQ/39-creating-new-maps-in-viper-fails,
3,BASE,clustering,stuff,1665-01-01,2020-07-01,['121'],most-relevant,max 15 unique area titles for n <= 100 items,,advanced output validation,https://trello.com/c/050TnPLY/61-severe-clustering-fail,
4,BASE,summarization,"""black lives matter""",1665-01-01,2020-06-08,['121'],most-relevant,no empty bubble titles,,output validation schema,https://trello.com/c/yL7LgQ08/55-empty-bubble-titles-appear-again,
5,BASE,summarization,"""systemic racism""",1665-01-01,2020-06-16,['121'],most-relevant,no stopwords at start/end of keywords in bubble titles,,advanced output validation,https://trello.com/c/2Bwe9W1w/56-stop-word-prevention-in-bubble-titles,seems to be fixed after locale update (??)
,all,layout,,,,,,similar layout for same/similar input,,create previews with 1-n papers randomly removed; or compare x-y differences by summing up,https://trello.com/c/Vlg4xMoe/46-issues-with-reproducible-rotation,
6,base,clustering,heart disease,2019-01-06,2020-08-06,['121'],most-relevant,clustering 2 items,1 == 2,,,
7,base,clustering,studi kelayakan proyek,2017-12-25,2020-08-06,['121'],most-relevant,max n cluster,16 <= 15,,,
8,base,clustering,stuff,1665-01-01,2020-07-01,['121'],most-relevant,max n cluster,17 <= 15,,,
9,base,clustering,stuff,2018-06-28,2020-08-06,['121'],most-relevant,max n cluster,20 <= 15,,,
10,base,clustering,corona,2020-03-31,2020-08-06,['121'],most-relevant,max n cluster,16 <= 15,,,
11,base,clustering,leadership,2019-01-18,2020-08-06,['121'],most-recent,max n cluster,25 <= 15,,,
12,base,clustering,stuff,2018-10-28,2020-08-06,['121'],most-relevant,max n cluster,16 <= 15,,,
13,pubmed,metadata_schema,russian,1809-01-01,2017-12-04,,most-recent,knowledgemap schema,None The column relation exists in the schema but not in the data frame,,,
14,base,metadata_schema,ijarah,2020-07-20,2020-08-06,['121'],most-recent,knowledgemap schema,None The column x exists in the schema but not in the data frame,,,
15,pubmed,metadata_schema,edelstahl,1809-01-01,2018-10-05,,most-recent,knowledgemap schema,None The column relation exists in the schema but not in the data frame,,,
16,base,metadata_schema,pembelajaran dimasa pandemi,2018-11-17,2020-08-06,['121'],most-recent,knowledgemap schema,None The column x exists in the schema but not in the data frame,,,
17,base,summarization,ijarah,2020-07-20,2020-08-06,['121'],most-recent,empty area titles,1 == 0,,,
18,base,summarization,virus corona,2020-01-29,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"being' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,
19,base,summarization,stuff,1665-01-01,2020-07-01,['121'],most-relevant,stopwords not start end keywords areatitles,"many' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,
20,base,summarization,pembelajaran di masa pandemi,2020-02-26,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"new' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,
21,base,summarization,stuff,2018-06-28,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"2' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,
22,base,summarization,corona,2020-03-31,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"being' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,
23,base,summarization,stuff,2018-10-28,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"just' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,
case id,data integration,affected component,search query,from,to,article types,sorting,test,failure reason,status,test ideas,Issue #,insights
0,PubMed,clustering,russian,1809-01-01,2017-12-04,all,most-recent,elbow heuristic succeeds,,fixed,,https://github.com/OpenKnowledgeMaps/Headstart/issues/169,
1,BASE,clustering,chicken eggshell coloration,1665-01-01,2018-09-19,['121'],most-recent,clustering of n=2 items succeeds,,fixed,test that randomly selects two items,https://github.com/OpenKnowledgeMaps/Headstart/issues/292,
,OpenAIRE,retrieval,295562,,,,,query succeeds,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/301,
,all,retrieval,any,any,any,any,any,subject_orig field exists,,,output validation schema,https://github.com/OpenKnowledgeMaps/Headstart/pull/440,
,OpenAIRE,retrieval,any,any,any,any,any,at least one document with more than one author,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/250,
,all,retrieval,any,any,any,any,any,presence of core fields in input_data,,,output validation schema,https://github.com/OpenKnowledgeMaps/Headstart/issues/265,
2,PubMed,summarization,edelstahl,1809-01-01,2018-10-05,all,most-recent,no empty bubble titles,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/303,
,OpenAIRE,clustering,EC/226716,,,,,clustering succeeds for n>500 items,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/246,
,OpenAIRE,clustering,EC/604102,,,,,clustering succeeds for n>500 items,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/246,
,OpenAIRE,enrichment,EC/647557,,,,,enrichment succeeds for malformed metadata,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/244,
,OpenAIRE,,,,,,,,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/261,
,BASE,retrieval,boolean queries,,,,,success for all boolean queries,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/198,
,all,retrieval,queries with Umlaute,,,,,success for all queries,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/285,
,PubMed,retrieval,any,,,,,success for many article types,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/197,
,all,summarization,,,,,,no non-unique bubble titles,,,,https://github.com/OpenKnowledgeMaps/Headstart/pull/377,
,BASE,retrieval,,,,,,mixed queries succeed,,,,https://github.com/OpenKnowledgeMaps/Headstart/issues/267,
,OpenAire,retrieval,any,,,,,all test queries succeed,,,,https://trello.com/c/gK4bDCHQ/39-creating-new-maps-in-viper-fails,
3,BASE,clustering,stuff,1665-01-01,2020-07-01,['121'],most-relevant,max 15 unique area titles for n <= 100 items,,,advanced output validation,https://trello.com/c/050TnPLY/61-severe-clustering-fail,
4,BASE,summarization,"""black lives matter""",1665-01-01,2020-06-08,['121'],most-relevant,no empty bubble titles,,,output validation schema,https://trello.com/c/yL7LgQ08/55-empty-bubble-titles-appear-again,
5,BASE,summarization,"""systemic racism""",1665-01-01,2020-06-16,['121'],most-relevant,no stopwords at start/end of keywords in bubble titles,,,advanced output validation,https://trello.com/c/2Bwe9W1w/56-stop-word-prevention-in-bubble-titles,seems to be fixed after locale update (??)
,all,layout,,,,,,similar layout for same/similar input,,,create previews with 1-n papers randomly removed; or compare x-y differences by summing up,https://trello.com/c/Vlg4xMoe/46-issues-with-reproducible-rotation,
6,base,clustering,heart disease,2019-01-06,2020-08-06,['121'],most-relevant,clustering 2 items,1 == 2,,,,
7,base,clustering,corona,2020-03-31,2020-08-06,['121'],most-relevant,max n cluster,16 <= 15,,,,
8,base,summarization,corona,2020-03-31,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"being' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,,
9,base,clustering,digital marketing,2018-07-18,2020-08-06,['121'],most-recent,max n cluster,17 <= 15,,,,
10,pubmed,metadata_schema,edelstahl,1809-01-01,2018-10-05,,most-recent,knowledgemap schema,None The column relation exists in the schema but not in the data frame,,,,
11,base,clustering,happiness,2020-01-31,2020-08-06,['121'],most-relevant,max n cluster,21 <= 15,,,,
12,base,clustering,pariwisata,2018-08-27,2020-08-06,['121'],most-relevant,clustering 2 items,1 == 2,,,,
13,base,summarization,pembelajaran di masa pandemi,2020-02-26,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"new' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,,
14,pubmed,metadata_schema,russian,1809-01-01,2017-12-04,,most-recent,knowledgemap schema,None The column relation exists in the schema but not in the data frame,,,,
15,base,clustering,social attention,2018-08-17,2020-08-06,['121'],most-relevant,clustering 2 items,1 == 2,,,,
16,base,clustering,studi kelayakan proyek,2017-12-25,2020-08-06,['121'],most-relevant,max n cluster,16 <= 15,,,,
17,base,clustering,stuff,1665-01-01,2020-07-01,['121'],most-relevant,max n cluster,17 <= 15,,,,
18,base,clustering,stuff,2018-06-28,2020-08-06,['121'],most-relevant,max n cluster,17 <= 15,,,,
19,base,summarization,stuff,2018-10-28,2020-08-06,['121'],most-relevant,stopwords not start end keywords areatitles,"just' not in {'0', '1', '2', 'a', ""a's"", 'able', ...}",,,,
14 changes: 14 additions & 0 deletions server/workers/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import pytest

# Value of the --random command-line flag; stays None until pytest_configure
# has run and copied the parsed option into this module-level name.
RANDOM = None


def pytest_addoption(parser):
    """Register the ``--random`` command-line flag with pytest.

    The flag is a boolean switch (``store_true``) that defaults to ``False``.

    Args:
        parser: the option parser pytest passes to this hook.
    """
    parser.addoption(
        "--random", action="store_true", default=False, help="run randomized tests"
    )


def pytest_configure(config):
    """Copy the parsed ``--random`` option into the module-level ``RANDOM``.

    Args:
        config: the pytest config object; ``config.option.random`` holds the
            value of the ``--random`` flag.
    """
    global RANDOM
    RANDOM = config.option.random
32 changes: 16 additions & 16 deletions server/workers/tests/knowncases/testcase0.json

Large diffs are not rendered by default.

Loading

0 comments on commit f130dbf

Please sign in to comment.