Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Sample Datatool #41

Merged
merged 14 commits into from
Apr 8, 2024
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ LAB](https://github.com/asreview/asreview) that can be used to:
- [**Deduplicate**](#data-dedup) data
- [**Stack**](#data-vstack-experimental) multiple datasets
- [**Compose**](#data-compose-experimental) a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets
- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations.
- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations
- [**Sample**](#sample) old, random, and new papers to check whether the terminology has changed over time

Several [tutorials](Tutorials.md) are available that show how
`ASReview-Datatools` can be used in different scenarios.
Expand Down Expand Up @@ -282,6 +283,15 @@ One thing to note is that OpenAlex will handle data requests faster if the sende
asreview data snowball input_dataset.csv output_dataset.csv --backward --email my_email@provider.com
```

## Sample

This datatool samples old, random, and new records from your dataset using the `asreview data sample` command. The sampled records are then stored in an output file. This can be useful for detecting concept drift, meaning that the words used for certain concepts change over time. By default, the script assumes that the dataset includes a column named `publication_year`; a different column name can be passed with `--year_column`. An example would be:

```bash
asreview data sample input_dataset.xlsx output_dataset.xlsx 50
```
This takes the `50` oldest and the `50` newest records from `input_dataset.xlsx` and randomly samples `50` more records from the remainder (without overlap with the old and new partitions!). The resulting 150 records are written to `output_dataset.xlsx`.

## License

This extension is published under the [MIT license](/LICENSE).
Expand Down
8 changes: 7 additions & 1 deletion asreviewcontrib/datatools/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@
from asreviewcontrib.datatools.convert import convert
from asreviewcontrib.datatools.describe import _parse_arguments_describe
from asreviewcontrib.datatools.describe import describe
from asreviewcontrib.datatools.sample import _parse_arguments_sample
from asreviewcontrib.datatools.sample import sample
from asreviewcontrib.datatools.snowball import _parse_arguments_snowball
from asreviewcontrib.datatools.snowball import snowball
from asreviewcontrib.datatools.stack import _parse_arguments_vstack
from asreviewcontrib.datatools.stack import vstack

DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball"]
DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball", "sample"]


class DataEntryPoint(BaseEntryPoint):
Expand Down Expand Up @@ -104,6 +106,10 @@ def execute(self, argv):
args_snowballing_parser = _parse_arguments_snowball()
args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:]))
snowball(**args_snowballing)
if argv[0] == "sample":
args_sample_parser = _parse_arguments_sample()
args_sample = vars(args_sample_parser.parse_args(argv[1:]))
sample(**args_sample)
if argv[0] == "vstack":
args_vstack_parser = _parse_arguments_vstack()
args_vstack = args_vstack_parser.parse_args(argv[1:])
Expand Down
70 changes: 70 additions & 0 deletions asreviewcontrib/datatools/sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import argparse

import pandas as pd
from asreview import ASReviewData
from asreview.data.base import load_data


def sample(input_path, output_path, nr_records, year_column="publication_year"):
    """Write the oldest, newest, and a random selection of records to a file.

    The input dataset is loaded with ``load_data``. The ``nr_records`` rows
    with the smallest values in ``year_column`` and the ``nr_records`` rows
    with the largest values are selected, plus ``nr_records`` rows drawn at
    random from all rows that are in neither group (rows without a year can
    be drawn here). The result is written as old, random, new — in that
    order — to ``output_path`` through ``ASReviewData.to_file``.

    Parameters
    ----------
    input_path:
        Path of the dataset to sample from.
    output_path:
        Path the sampled records are written to.
    nr_records:
        Number of records to take for each of the three groups.
    year_column:
        Name of the column holding the publication year.

    Raises
    ------
    ValueError
        If ``year_column`` is missing, if ``nr_records`` is smaller than 1,
        if the dataset holds fewer than ``3 * nr_records`` records, or if
        there are not at least ``2 * nr_records`` records with a year.
    """
    df_input = load_data(input_path).df

    # Only the exact column name is accepted; no aliases are tried.
    if year_column not in df_input.columns:
        raise ValueError(f"• The input file should have a {year_column} column.")

    # Reject non-positive sizes before doing any arithmetic with them.
    if nr_records < 1:
        raise ValueError("• The number of records to sample should be at least 1.")

    # Three disjoint groups of nr_records rows each must fit in the dataset.
    if nr_records * 3 > len(df_input):
        raise ValueError(
            "• The number of records to sample is too large."
            f" Only {len(df_input)} records are present in the input file."
            f" You are trying to sample {nr_records * 3} records."
        )

    # Only records that actually have a year can be ranked as old or new.
    dated_records = df_input[df_input[year_column].notnull()]

    if dated_records.empty:
        raise ValueError(f"• The input file has no {year_column} values.")

    # The old and new groups must not overlap.
    if len(dated_records) < nr_records * 2:
        raise ValueError("• Not enough dated records to sample from.")

    # A stable sort keeps records with equal years in input order, so the
    # old/new selection is the same on every run.
    sorted_records = dated_records.sort_values(
        year_column, ascending=True, kind="mergesort"
    )

    old_records = sorted_records.head(nr_records)
    new_records = sorted_records.tail(nr_records)

    # Draw the random group from everything not already picked, which
    # includes records without a year.
    records_to_exclude = pd.concat([old_records, new_records]).index
    remaining_records = df_input[~df_input.index.isin(records_to_exclude)]

    sampled_records = remaining_records.sample(nr_records)

    # Output order: old, random, new.
    df_out = pd.concat([old_records, sampled_records, new_records])

    asdata = ASReviewData(df=df_out)
    asdata.to_file(output_path)


def _parse_arguments_sample():
parser = argparse.ArgumentParser(prog="asreview data sample")
parser.add_argument("input_path", type=str, help="The input file path.")
parser.add_argument("output_path", type=str, help="The output file path.")
parser.add_argument(
"nr_records",
type=int,
help="The amount of records for old, random, and new records each.",
)
parser.add_argument(
"--year_column",
default="publication_year",
type=str,
help="The name of the column containing the publication year.",
)

return parser
7 changes: 7 additions & 0 deletions tests/demo_data/sample_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
title, doi, publication_year
title1, doi1, 2005
title2, doi2, 2001
title3, doi3,
title4, doi4, 2003
title5, doi5, 2004
title6, doi6, 2000
17 changes: 17 additions & 0 deletions tests/test_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# create unit tests for the sample.py file
from pathlib import Path

import pandas as pd

from asreviewcontrib.datatools.sample import sample

# Path to the demo CSV passed to sample() as input; despite the name, this
# points at a file, not a directory.
INPUT_DIR = Path(__file__).parent / "demo_data" / "sample_data.csv"


def test_sample(tmpdir):
    """Sample one record per group and check the size and ordering of the output.

    With ``nr_records=1`` the output should hold three rows: the oldest
    record first and the newest record last.
    """
    output_file = tmpdir / "output.csv"
    sample(INPUT_DIR, output_file, 1, "publication_year")

    result = pd.read_csv(output_file)
    assert len(result) == 3
    assert "publication_year" in result.columns

    # Rows are written as old, random, new.
    oldest_row = result.iloc[0]
    assert oldest_row["publication_year"] == 2000
    newest_row = result.iloc[2]
    assert newest_row["publication_year"] == 2005
Loading