Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support non-identical file names between .wav and .eaf, and recognise media offsets #215

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions elpis/gui/.eslintrc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
"eslint:recommended",
"plugin:react/recommended",
],
env: {
es6: true,
},
settings: {
react: {
version: "detect",
Expand Down
70 changes: 68 additions & 2 deletions elpis/gui/src/components/Dataset/FileUpload.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,64 @@
import React, {Component} from "react";
import classNames from "classnames";
import Dropzone from "react-dropzone";
import {Button} from "semantic-ui-react";
import {Button, Message, MessageHeader} from "semantic-ui-react";
import {fromEvent} from "file-selector";
import {withTranslation} from "react-i18next";
import {datasetFiles} from "redux/actions/datasetActions";
import {connect} from "react-redux";

class FileUpload extends Component {
onDrop = (acceptedFiles) => {
constructor(props) {
super(props);
this.state = {missingFiles: []};
}

/**
 * Read an ELAN (.eaf) file and extract the path of its linked media file.
 *
 * The path is the RELATIVE_MEDIA_URL attribute of the first MEDIA_DESCRIPTOR
 * found under ANNOTATION_DOCUMENT > HEADER, with a leading "./" removed.
 *
 * @param {File} file - The .eaf file to read as text.
 * @returns {Promise<string|undefined>} Resolves with the linked media path,
 *   or with undefined when the document has no usable media descriptor.
 *   Rejects with the reader's error when the file itself cannot be read.
 */
parseElan = async (file) => {
  return new Promise((resolve, reject) => {
    const reader = new window.FileReader();

    reader.onload = () => {
      // An exception thrown inside this callback would never reach the
      // promise and would leave it pending forever, so every failure to
      // locate the descriptor is turned into an explicit result instead.
      try {
        const parser = new window.DOMParser();
        const eafDoc = parser.parseFromString(reader.result, "application/xml");
        const relativeUrl = eafDoc
          .getElementsByTagName("ANNOTATION_DOCUMENT")[0]
          .getElementsByTagName("HEADER")[0]
          .getElementsByTagName("MEDIA_DESCRIPTOR")[0]
          .getAttribute("RELATIVE_MEDIA_URL");

        if (!relativeUrl) {
          // Descriptor present but without a relative media URL.
          resolve(undefined);
          return;
        }

        // Only strip the "./" prefix; a URL written without it (for example
        // "recording.wav") is already the value we want.
        resolve(relativeUrl.startsWith("./") ? relativeUrl.slice(2) : relativeUrl);
      } catch (error) {
        // Malformed document or missing ANNOTATION_DOCUMENT / HEADER /
        // MEDIA_DESCRIPTOR element: report "no linked media" to the caller.
        console.warn("Could not read the media descriptor of", file.name, error);
        resolve(undefined);
      }
    };
    reader.onerror = () => {
      reject(reader.error);
    };

    // Start reading only once both handlers are attached.
    reader.readAsText(file);
  });
}

onDrop = async (acceptedFiles) => {
// TODO: Behaviour when dropping multiple times is still undefined
// (do we empty the dataset?)
this.setState({missingFiles: []});
console.log("files dropped:", acceptedFiles);

const eafFiles = acceptedFiles
.filter(file => file.name.split(".").pop() === "eaf");
const wavFileNames = acceptedFiles
.filter(file => file.name.split(".").pop() === "wav")
.map(file => file.name);

// Array.prototype.forEach does not wait for async callbacks, so use a plain loop to await each file in turn.
for (let i = 0; i < eafFiles.length; i++) {
const parsedWavFile = await this.parseElan(eafFiles[i]);
const identicalWavFile = eafFiles[i].name.split(".")[0].concat(".wav");

if (!wavFileNames.includes(parsedWavFile) && !wavFileNames.includes(identicalWavFile)) {
this.setState(prevState => ({
missingFiles: [...prevState.missingFiles, [identicalWavFile, parsedWavFile]],
}));
}
}

var formData = new FormData();

acceptedFiles.forEach(file => {
Expand All @@ -25,6 +73,24 @@ class FileUpload extends Component {

return (
<div className="FileUpload">
{this.state.missingFiles.length > 0 &&
<Message negative>
<MessageHeader>
{t("dataset.files.missingAudioFiles")}
</MessageHeader>
<p>
{t("dataset.files.missingAudioFilesDescription")}
</p>
<ul>
{this.state.missingFiles.map(wavFile => {
return (
<li key={wavFile[0]}>
{wavFile[0]} / {wavFile[1]}
</li>
);
})}
</ul>
</Message>}
<Dropzone
disabled={interactionDisabled}
className="dropzone"
Expand Down
4 changes: 3 additions & 1 deletion elpis/gui/src/translations/en-GB/common.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@
"tierNameDescription": "or use a tier name.",
"puncDescription": "Punctuation will be removed from the transcriptions. Add punctuation marks here that you want to be replaced by spaces when removed.",
"uploadButton": "Upload",
"saveButton": "Save Settings"
"saveButton": "Save Settings",
"missingAudioFiles": "Missing audio files",
"missingAudioFilesDescription": "The following audio files are missing:"
},
"fileUpload": {
"audioLabel": "Audio files:",
Expand Down
4 changes: 3 additions & 1 deletion elpis/gui/src/translations/fr/common.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@
"tierNameDescription": "ou utilisez un nom d’échelon.",
"puncDescription": "La ponctuation sera supprimée des transcriptions. Ajoutez ici les signes de ponctuation que vous désirez remplacer par des espaces.",
"uploadButton": "Téléverser",
"saveButton": "Enregistrer les paramètres"
"saveButton": "Enregistrer les paramètres",
"missingAudioFiles": "Fichiers audio manquants",
"missingAudioFilesDescription": "Les fichiers audio suivants sont manquants:"
},
"fileUpload": {
"audioLabel": "Fichiers audio :",
Expand Down
15 changes: 12 additions & 3 deletions elpis/transformer/elan.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,20 @@ def import_eaf_file(eaf_paths: List[str],
end = annotation[1]
annotation = annotation[2]

time_origin = input_eaf.get_linked_files()[0].get("TIME_ORIGIN")
offset = int(time_origin) if time_origin is not None else 0

# If matching file, use that. Otherwise, fall back to the relative media url.
if os.path.isfile(f"{file_name}.wav"):
audio_file_name = f"{file_name}.wav"
else:
audio_file_name = input_eaf.get_linked_files()[0].get("RELATIVE_MEDIA_URL")

utterance = {
"audio_file_name": f"{file_name}.wav",
"audio_file_name": audio_file_name,
"transcript": annotation,
"start_ms": start,
"stop_ms": end,
"start_ms": start + offset,
"stop_ms": end + offset,
"speaker_id": speaker_id
}

Expand Down