Spaces: Runtime error
Commit · 6886c22
Parent(s): 5851ff3
added models and inference
Browse files:
- __pycache__/models.cpython-39.pyc +0 -0
- app.py +36 -0
- data/test_audio.wav +0 -0
- data/test_audio_2.wav +0 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-39.pyc +0 -0
- src/__pycache__/collator.cpython-39.pyc +0 -0
- src/__pycache__/modeling_outputs.cpython-39.pyc +0 -0
- src/__pycache__/models.cpython-39.pyc +0 -0
- src/__pycache__/trainer.cpython-39.pyc +0 -0
- src/collator.py +58 -0
- src/modeling_outputs.py +12 -0
- src/models.py +114 -0
- src/trainer.py +62 -0
__pycache__/models.cpython-39.pyc ADDED
Binary file (5.59 kB).
app.py ADDED
@@ -0,0 +1,36 @@
import torch
import torch.nn.functional as F
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
from src.models import Wav2Vec2ForSpeechClassification
import gradio as gr
import librosa

device = torch.device("cpu")
model_name_or_path = "harshit345/xlsr-wav2vec-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)

def load_data(path):
    speech, sampling_rate = librosa.load(path)
    if len(speech.shape) > 1:
        speech = speech[:, 0] + speech[:, 1]
    if sampling_rate != 16000:
        speech = librosa.resample(speech, sampling_rate, 16000)
    return speech

def inference(path):
    speech = load_data(path)
    inputs = feature_extractor(speech, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(inputs).logits
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = {config.id2label[i]: float(round(score, 2)) for i, score in enumerate(scores)}
    return outputs

examples = ['data/test_audio.wav', 'data/test_audio_2.wav']
inputs = gr.inputs.Audio(label="Input Audio", type="filepath", source="microphone")
outputs = gr.outputs.Label(type="confidences", label="Output Scores")
iface = gr.Interface(inference, inputs, outputs=["label"], examples=examples)
iface.launch(debug=True)
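For a quick smoke test of the same pipeline outside the Gradio UI, the steps in app.py can be replayed directly on one of the bundled clips. This is an illustrative sketch only (not part of the commit); it assumes the model weights download successfully and loads the clip at the 16 kHz rate the feature extractor expects.

# Illustrative smoke test (not part of this commit): run the same
# feature-extraction + forward pass as app.py on a bundled test clip.
import torch
import torch.nn.functional as F
import librosa
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
from src.models import Wav2Vec2ForSpeechClassification

model_name_or_path = "harshit345/xlsr-wav2vec-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).eval()

speech, _ = librosa.load("data/test_audio.wav", sr=16000)  # load mono at 16 kHz
inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt").input_values
with torch.no_grad():
    probs = F.softmax(model(inputs).logits, dim=-1)[0]
print(config.id2label[int(probs.argmax())])  # top-scoring emotion label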
data/test_audio.wav ADDED
Binary file (505 kB).

data/test_audio_2.wav ADDED
Binary file (538 kB).
src/__init__.py ADDED
File without changes

src/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (149 Bytes).

src/__pycache__/collator.cpython-39.pyc ADDED
Binary file (3.29 kB).

src/__pycache__/modeling_outputs.cpython-39.pyc ADDED
Binary file (700 Bytes).

src/__pycache__/models.cpython-39.pyc ADDED
Binary file (3.42 kB).

src/__pycache__/trainer.cpython-39.pyc ADDED
Binary file (2.04 kB).
src/collator.py ADDED
@@ -0,0 +1,58 @@
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`)
            The feature_extractor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    feature_extractor: Wav2Vec2FeatureExtractor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch
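For orientation (not part of the commit), the collator is meant to be passed to a Trainer as data_collator; a minimal sketch of what it does to a batch, assuming each example carries raw input_values and an integer class label:

# Minimal sketch (not in this commit): pad two variable-length examples.
from transformers import Wav2Vec2FeatureExtractor
from src.collator import DataCollatorCTCWithPadding

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "harshit345/xlsr-wav2vec-speech-emotion-recognition"
)
collator = DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True)

features = [
    {"input_values": [0.1] * 16000, "labels": 2},  # 1 s of audio, class 2
    {"input_values": [0.0] * 8000, "labels": 5},   # 0.5 s of audio, class 5
]
batch = collator(features)
print(batch["input_values"].shape)  # torch.Size([2, 16000]) after padding
print(batch["labels"])              # tensor([2, 5])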
src/modeling_outputs.py ADDED
@@ -0,0 +1,12 @@
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
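SpeechClassifierOutput behaves like any other transformers ModelOutput: fields can be read by attribute or key, and fields left as None are dropped from the tuple view. A quick illustration (not part of the commit):

# Quick illustration (not in this commit) of ModelOutput behaviour.
import torch
from src.modeling_outputs import SpeechClassifierOutput

out = SpeechClassifierOutput(logits=torch.zeros(1, 8))
print(out.logits.shape)     # attribute access -> torch.Size([1, 8])
print(out["logits"].shape)  # mapping access
print(len(out.to_tuple()))  # 1, since loss/hidden_states/attentions are None and dropped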
src/models.py ADDED
@@ -0,0 +1,114 @@
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

from src.modeling_outputs import SpeechClassifierOutput


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
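The merged_strategy pooling is the main addition over the stock Wav2Vec2 encoder: per-frame hidden states of shape (batch, time, hidden) are collapsed over the time axis before the classification head. A standalone illustration of that step (not part of the commit):

# Standalone illustration (not in this commit) of the pooling in merged_strategy.
import torch

hidden_states = torch.randn(2, 49, 768)  # (batch, time frames, hidden size)

pooled_mean = torch.mean(hidden_states, dim=1)    # mode="mean" -> shape (2, 768)
pooled_sum = torch.sum(hidden_states, dim=1)      # mode="sum"  -> shape (2, 768)
pooled_max = torch.max(hidden_states, dim=1)[0]   # mode="max"  -> shape (2, 768)

print(pooled_mean.shape, pooled_sum.shape, pooled_max.shape)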
src/trainer.py ADDED
@@ -0,0 +1,62 @@
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """

        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()
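For orientation (not part of the commit), these pieces wire together like a stock Hugging Face Trainer. A hypothetical training setup, assuming a transformers version where Trainer still exposes use_amp (as training_step above requires) and using a tiny in-memory dataset as a stand-in for real labelled audio:

# Hypothetical wiring (not in this commit) of the collator, model and trainer.
import torch
from transformers import TrainingArguments, Wav2Vec2FeatureExtractor
from src.collator import DataCollatorCTCWithPadding
from src.models import Wav2Vec2ForSpeechClassification
from src.trainer import CTCTrainer

model_name_or_path = "harshit345/xlsr-wav2vec-speech-emotion-recognition"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)
model.freeze_feature_extractor()  # keep the convolutional feature encoder fixed

# Stand-in dataset: two random clips with integer emotion labels.
train_dataset = [
    {"input_values": torch.randn(16000).tolist(), "labels": 0},
    {"input_values": torch.randn(12000).tolist(), "labels": 1},
]

training_args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
)

trainer = CTCTrainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorCTCWithPadding(feature_extractor=feature_extractor),
    train_dataset=train_dataset,
)
trainer.train()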