Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add data attestation as a requirement #481

Merged
merged 8 commits into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions everyvoice/.schema/everyvoice-aligner-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,12 @@
"title": "Label",
"type": "string"
},
"permissions_obtained": {
"default": false,
"description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.",
"title": "Permissions Obtained",
"type": "boolean"
},
"data_dir": {
"default": "/please/create/a/path/to/your/dataset/data",
"description": "The path to the directory with your audio files.",
Expand Down
6 changes: 6 additions & 0 deletions everyvoice/.schema/everyvoice-shared-data-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@
"title": "Label",
"type": "string"
},
"permissions_obtained": {
"default": false,
"description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.",
"title": "Permissions Obtained",
"type": "boolean"
},
"data_dir": {
"default": "/please/create/a/path/to/your/dataset/data",
"description": "The path to the directory with your audio files.",
Expand Down
6 changes: 6 additions & 0 deletions everyvoice/.schema/everyvoice-spec-to-wav-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,12 @@
"title": "Label",
"type": "string"
},
"permissions_obtained": {
"default": false,
"description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.",
"title": "Permissions Obtained",
"type": "boolean"
},
"data_dir": {
"default": "/please/create/a/path/to/your/dataset/data",
"description": "The path to the directory with your audio files.",
Expand Down
6 changes: 6 additions & 0 deletions everyvoice/.schema/everyvoice-text-to-spec-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,12 @@
"title": "Label",
"type": "string"
},
"permissions_obtained": {
"default": false,
"description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.",
"title": "Permissions Obtained",
"type": "boolean"
},
"data_dir": {
"default": "/please/create/a/path/to/your/dataset/data",
"description": "The path to the directory with your audio files.",
Expand Down
6 changes: 6 additions & 0 deletions everyvoice/.schema/everyvoice-text-to-wav-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,12 @@
"title": "Label",
"type": "string"
},
"permissions_obtained": {
"default": false,
"description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.",
"title": "Permissions Obtained",
"type": "boolean"
},
"data_dir": {
"default": "/please/create/a/path/to/your/dataset/data",
"description": "The path to the directory with your audio files.",
Expand Down
19 changes: 17 additions & 2 deletions everyvoice/config/preprocessing_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Annotated, List, Optional, Union

from annotated_types import Ge, Le
from pydantic import Field, FilePath, ValidationInfo, model_validator
from pydantic import Field, FilePath, ValidationInfo, field_validator, model_validator

from everyvoice.config.shared_types import ConfigModel, PartialLoadConfig, init_context
from everyvoice.config.utils import (
Expand Down Expand Up @@ -96,6 +96,11 @@ class AudioConfig(ConfigModel):

class Dataset(PartialLoadConfig):
label: str = Field("YourDataSet", description="A label for the source of data")
permissions_obtained: bool = Field(
False,
description="An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.",
validate_default=True,
)
data_dir: PossiblyRelativePath = Field(
Path("/please/create/a/path/to/your/dataset/data"),
description="The path to the directory with your audio files.",
Expand All @@ -113,6 +118,14 @@ class Dataset(PartialLoadConfig):
description="Advanced. A list of SoX effects to apply to your audio prior to preprocessing. Run python -c 'import torchaudio; print(torchaudio.sox_effects.effect_names())' to see a list of supported effects.",
)

@field_validator("permissions_obtained")
def check_permissions(cls, permissions_obtained: bool) -> bool:
if not permissions_obtained:
raise ValueError(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage says this raise statement is not exercised by unit testing. It should be easy to add one test case that creates a Dataset without specifying permissions_obtained=True: for example, duplicate the test case you had to fix by adding permissions_obtained=True, but leave that argument out (or set it to False) and assert that a ValueError is raised. That would exercise this raise.

"You are trying to run a model that does not have permission for the data it is using. Please confirm you have permission to use this data and edit your configuration file accordingly."
)
return permissions_obtained


class PreprocessingConfig(PartialLoadConfig):
dataset: str = Field("YourDataSet", description="The name of the dataset.")
Expand All @@ -136,7 +149,9 @@ class PreprocessingConfig(PartialLoadConfig):
None, description="The path to an audio configuration file."
)
source_data: List[Dataset] = Field(
default_factory=lambda: [Dataset()],
default_factory=lambda: [
Dataset(permissions_obtained=True)
], # The default factory doesn't actually point to any data, so we can treat it as having permissions obtained
description="A list of datasets.",
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ source_data:
filelist: ../r-filelist.psv
filelist_loader: everyvoice.utils.generic_psv_filelist_reader
label: dataset_0
permissions_obtained: true
sox_effects:
- [channel, '1']
train_split: 0.9
1 change: 1 addition & 0 deletions everyvoice/tests/preprocessed_audio_fixture.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class PreprocessedAudioFixture:
Dataset(
data_dir=wavs_dir,
filelist=data_dir / "metadata.psv",
permissions_obtained=True,
)
],
),
Expand Down
7 changes: 6 additions & 1 deletion everyvoice/tests/test_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,12 @@ def test_shared_sox(self) -> None:
vocoder_config = VocoderConfig(
contact=self.contact,
preprocessing=PreprocessingConfig(
source_data=[Dataset(), Dataset(), Dataset(), Dataset()]
source_data=[
Dataset(permissions_obtained=True),
Dataset(permissions_obtained=True),
Dataset(permissions_obtained=True),
Dataset(permissions_obtained=True),
]
),
)
config: EveryVoiceConfig = EveryVoiceConfig(
Expand Down
24 changes: 18 additions & 6 deletions everyvoice/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ def test_run_doctest(self):
def test_read_filelist(self):
self.assertEqual(self.filelist[0]["basename"], "LJ050-0269")

def test_no_permissions(self):
no_permissions_args = self.fp_config.model_dump()
no_permissions_args["preprocessing"]["source_data"][0][
"permissions_obtained"
] = False
with self.assertRaises(ValueError):
FeaturePredictionConfig(**no_permissions_args)

def test_process_audio_for_alignment(self):
config = AlignerConfig(contact=self.contact)
for entry in self.filelist[1:]:
Expand Down Expand Up @@ -392,9 +400,9 @@ def test_text_processing(self):
preprocessed_dir.mkdir(parents=True, exist_ok=True)
output_filelist = preprocessed_dir / "preprocessed_filelist.psv"
shutil.copyfile(filelist_test_info["path"], output_filelist)
fp_config.preprocessing.source_data[0].filelist = (
filelist_test_info["path"]
)
fp_config.preprocessing.source_data[
0
].filelist = filelist_test_info["path"]
fp_config.preprocessing.save_dir = preprocessed_dir
preprocessor = Preprocessor(fp_config)
with capture_stdout() as output, mute_logger(
Expand Down Expand Up @@ -489,9 +497,13 @@ def test_incremental_preprocess(self):
with tempfile.TemporaryDirectory(
prefix="test_incremental_preprocess", dir="."
) as tmpdir:
fp_config, lj_filelist, full_filelist, partial_filelist, to_process = (
self.get_simple_config(tmpdir)
)
(
fp_config,
lj_filelist,
full_filelist,
partial_filelist,
to_process,
) = self.get_simple_config(tmpdir)

fp_config.preprocessing.source_data[0].filelist = partial_filelist
with capture_stdout() as output, mute_logger("everyvoice.preprocessor"):
Expand Down
1 change: 1 addition & 0 deletions everyvoice/wizard/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ def effect(self):
filelist=new_filelist_path,
filelist_loader=filelist_loader,
sox_effects=sox_effects,
permissions_obtained=True, # If you get this far, you've answered the Dataset Permission Attestation step correctly
)
)
text_config = TextConfig(symbols=Symbols(**symbols))
Expand Down
Loading