# Source code for pyannote.metrics.detection

#!/usr/bin/env python
# encoding: utf-8

# The MIT License (MIT)

# Copyright (c) 2012-2020 CNRS

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# AUTHORS
# Hervé BREDIN - http://herve.niderb.fr
# Marvin LAVECHIN

from .base import BaseMetric, f_measure
from .utils import UEMSupportMixin

DER_NAME = 'detection error rate'
DER_TOTAL = 'total'
DER_FALSE_ALARM = 'false alarm'
DER_MISS = 'miss'


[docs]class DetectionErrorRate(UEMSupportMixin, BaseMetric): """Detection error rate This metric can be used to evaluate binary classification tasks such as speech activity detection, for instance. Inputs are expected to only contain segments corresponding to the positive class (e.g. speech regions). Gaps in the inputs considered as the negative class (e.g. non-speech regions). It is computed as (fa + miss) / total, where fa is the duration of false alarm (e.g. non-speech classified as speech), miss is the duration of missed detection (e.g. speech classified as non-speech), and total is the total duration of the positive class in the reference. Parameters ---------- collar : float, optional Duration (in seconds) of collars removed from evaluation around boundaries of reference segments (one half before, one half after). skip_overlap : bool, optional Set to True to not evaluate overlap regions. Defaults to False (i.e. keep overlap regions). """ @classmethod def metric_name(cls): return DER_NAME @classmethod def metric_components(cls): return [DER_TOTAL, DER_FALSE_ALARM, DER_MISS] def __init__(self, collar=0.0, skip_overlap=False, **kwargs): super(DetectionErrorRate, self).__init__(**kwargs) self.collar = collar self.skip_overlap = skip_overlap
[docs] def compute_components(self, reference, hypothesis, uem=None, **kwargs): reference, hypothesis, uem = self.uemify( reference, hypothesis, uem=uem, collar=self.collar, skip_overlap=self.skip_overlap, returns_uem=True) reference = reference.get_timeline(copy=False).support() hypothesis = hypothesis.get_timeline(copy=False).support() reference_ = reference.gaps(support=uem) hypothesis_ = hypothesis.gaps(support=uem) false_positive = 0. for r_, h in reference_.co_iter(hypothesis): false_positive += (r_ & h).duration false_negative = 0. for r, h_ in reference.co_iter(hypothesis_): false_negative += (r & h_).duration detail = {} detail[DER_MISS] = false_negative detail[DER_FALSE_ALARM] = false_positive detail[DER_TOTAL] = reference.duration() return detail
[docs] def compute_metric(self, detail): error = 1. * (detail[DER_FALSE_ALARM] + detail[DER_MISS]) total = 1. * detail[DER_TOTAL] if total == 0.: if error == 0: return 0. else: return 1. else: return error / total
ACCURACY_NAME = 'detection accuracy' ACCURACY_TRUE_POSITIVE = 'true positive' ACCURACY_TRUE_NEGATIVE = 'true negative' ACCURACY_FALSE_POSITIVE = 'false positive' ACCURACY_FALSE_NEGATIVE = 'false negative'
[docs]class DetectionAccuracy(DetectionErrorRate): """Detection accuracy This metric can be used to evaluate binary classification tasks such as speech activity detection, for instance. Inputs are expected to only contain segments corresponding to the positive class (e.g. speech regions). Gaps in the inputs considered as the negative class (e.g. non-speech regions). It is computed as (tp + tn) / total, where tp is the duration of true positive (e.g. speech classified as speech), tn is the duration of true negative (e.g. non-speech classified as non-speech), and total is the total duration of the input signal. Parameters ---------- collar : float, optional Duration (in seconds) of collars removed from evaluation around boundaries of reference segments (one half before, one half after). skip_overlap : bool, optional Set to True to not evaluate overlap regions. Defaults to False (i.e. keep overlap regions). """ @classmethod def metric_name(cls): return ACCURACY_NAME @classmethod def metric_components(cls): return [ACCURACY_TRUE_POSITIVE, ACCURACY_TRUE_NEGATIVE, ACCURACY_FALSE_POSITIVE, ACCURACY_FALSE_NEGATIVE]
[docs] def compute_components(self, reference, hypothesis, uem=None, **kwargs): reference, hypothesis, uem = self.uemify( reference, hypothesis, uem=uem, collar=self.collar, skip_overlap=self.skip_overlap, returns_uem=True) reference = reference.get_timeline(copy=False).support() hypothesis = hypothesis.get_timeline(copy=False).support() reference_ = reference.gaps(support=uem) hypothesis_ = hypothesis.gaps(support=uem) true_positive = 0. for r, h in reference.co_iter(hypothesis): true_positive += (r & h).duration true_negative = 0. for r_, h_ in reference_.co_iter(hypothesis_): true_negative += (r_ & h_).duration false_positive = 0. for r_, h in reference_.co_iter(hypothesis): false_positive += (r_ & h).duration false_negative = 0. for r, h_ in reference.co_iter(hypothesis_): false_negative += (r & h_).duration detail = {} detail[ACCURACY_TRUE_NEGATIVE] = true_negative detail[ACCURACY_TRUE_POSITIVE] = true_positive detail[ACCURACY_FALSE_NEGATIVE] = false_negative detail[ACCURACY_FALSE_POSITIVE] = false_positive return detail
[docs] def compute_metric(self, detail): numerator = 1. * (detail[ACCURACY_TRUE_NEGATIVE] + detail[ACCURACY_TRUE_POSITIVE]) denominator = 1. * (detail[ACCURACY_TRUE_NEGATIVE] + detail[ACCURACY_TRUE_POSITIVE] + detail[ACCURACY_FALSE_NEGATIVE] + detail[ACCURACY_FALSE_POSITIVE]) if denominator == 0.: return 1. else: return numerator / denominator
PRECISION_NAME = 'detection precision' PRECISION_RETRIEVED = 'retrieved' PRECISION_RELEVANT_RETRIEVED = 'relevant retrieved'
[docs]class DetectionPrecision(DetectionErrorRate): """Detection precision This metric can be used to evaluate binary classification tasks such as speech activity detection, for instance. Inputs are expected to only contain segments corresponding to the positive class (e.g. speech regions). Gaps in the inputs considered as the negative class (e.g. non-speech regions). It is computed as tp / (tp + fp), where tp is the duration of true positive (e.g. speech classified as speech), and fp is the duration of false positive (e.g. non-speech classified as speech). Parameters ---------- collar : float, optional Duration (in seconds) of collars removed from evaluation around boundaries of reference segments (one half before, one half after). skip_overlap : bool, optional Set to True to not evaluate overlap regions. Defaults to False (i.e. keep overlap regions). """ @classmethod def metric_name(cls): return PRECISION_NAME @classmethod def metric_components(cls): return [PRECISION_RETRIEVED, PRECISION_RELEVANT_RETRIEVED]
[docs] def compute_components(self, reference, hypothesis, uem=None, **kwargs): reference, hypothesis, uem = self.uemify( reference, hypothesis, uem=uem, collar=self.collar, skip_overlap=self.skip_overlap, returns_uem=True) reference = reference.get_timeline(copy=False).support() hypothesis = hypothesis.get_timeline(copy=False).support() reference_ = reference.gaps(support=uem) true_positive = 0. for r, h in reference.co_iter(hypothesis): true_positive += (r & h).duration false_positive = 0. for r_, h in reference_.co_iter(hypothesis): false_positive += (r_ & h).duration detail = {} detail[PRECISION_RETRIEVED] = true_positive + false_positive detail[PRECISION_RELEVANT_RETRIEVED] = true_positive return detail
[docs] def compute_metric(self, detail): relevant_retrieved = 1. * detail[PRECISION_RELEVANT_RETRIEVED] retrieved = 1. * detail[PRECISION_RETRIEVED] if retrieved == 0.: return 1. else: return relevant_retrieved / retrieved
RECALL_NAME = 'detection recall' RECALL_RELEVANT = 'relevant' RECALL_RELEVANT_RETRIEVED = 'relevant retrieved'
[docs]class DetectionRecall(DetectionErrorRate): """Detection recall This metric can be used to evaluate binary classification tasks such as speech activity detection, for instance. Inputs are expected to only contain segments corresponding to the positive class (e.g. speech regions). Gaps in the inputs considered as the negative class (e.g. non-speech regions). It is computed as tp / (tp + fn), where tp is the duration of true positive (e.g. speech classified as speech), and fn is the duration of false negative (e.g. speech classified as non-speech). Parameters ---------- collar : float, optional Duration (in seconds) of collars removed from evaluation around boundaries of reference segments (one half before, one half after). skip_overlap : bool, optional Set to True to not evaluate overlap regions. Defaults to False (i.e. keep overlap regions). """ @classmethod def metric_name(cls): return RECALL_NAME @classmethod def metric_components(cls): return [RECALL_RELEVANT, RECALL_RELEVANT_RETRIEVED]
[docs] def compute_components(self, reference, hypothesis, uem=None, **kwargs): reference, hypothesis, uem = self.uemify( reference, hypothesis, uem=uem, collar=self.collar, skip_overlap=self.skip_overlap, returns_uem=True) reference = reference.get_timeline(copy=False).support() hypothesis = hypothesis.get_timeline(copy=False).support() hypothesis_ = hypothesis.gaps(support=uem) true_positive = 0. for r, h in reference.co_iter(hypothesis): true_positive += (r & h).duration false_negative = 0. for r, h_ in reference.co_iter(hypothesis_): false_negative += (r & h_).duration detail = {} detail[RECALL_RELEVANT] = true_positive + false_negative detail[RECALL_RELEVANT_RETRIEVED] = true_positive return detail
[docs] def compute_metric(self, detail): relevant_retrieved = 1. * detail[RECALL_RELEVANT_RETRIEVED] relevant = 1. * detail[RECALL_RELEVANT] if relevant == 0.: if relevant_retrieved == 0: return 1. else: return 0. else: return relevant_retrieved / relevant
DFS_NAME = 'F[precision|recall]' DFS_PRECISION_RETRIEVED = 'retrieved' DFS_RECALL_RELEVANT = 'relevant' DFS_RELEVANT_RETRIEVED = 'relevant retrieved'
[docs]class DetectionPrecisionRecallFMeasure(UEMSupportMixin, BaseMetric): """Compute detection precision and recall, and return their F-score Parameters ---------- collar : float, optional Duration (in seconds) of collars removed from evaluation around boundaries of reference segments (one half before, one half after). skip_overlap : bool, optional Set to True to not evaluate overlap regions. Defaults to False (i.e. keep overlap regions). beta : float, optional When beta > 1, greater importance is given to recall. When beta < 1, greater importance is given to precision. Defaults to 1. See also -------- pyannote.metrics.detection.DetectionPrecision pyannote.metrics.detection.DetectionRecall pyannote.metrics.base.f_measure """ @classmethod def metric_name(cls): return DFS_NAME @classmethod def metric_components(cls): return [DFS_PRECISION_RETRIEVED, DFS_RECALL_RELEVANT, DFS_RELEVANT_RETRIEVED] def __init__(self, collar=0.0, skip_overlap=False, beta=1., **kwargs): super(DetectionPrecisionRecallFMeasure, self).__init__(**kwargs) self.collar = collar self.skip_overlap = skip_overlap self.beta = beta
[docs] def compute_components(self, reference, hypothesis, uem=None, **kwargs): reference, hypothesis, uem = self.uemify( reference, hypothesis, uem=uem, collar=self.collar, skip_overlap=self.skip_overlap, returns_uem=True) reference = reference.get_timeline(copy=False).support() hypothesis = hypothesis.get_timeline(copy=False).support() reference_ = reference.gaps(support=uem) hypothesis_ = hypothesis.gaps(support=uem) # Better to recompute everything from scratch instead of calling the # DetectionPrecision & DetectionRecall classes (we skip one of the loop # that computes the amount of true positives). true_positive = 0. for r, h in reference.co_iter(hypothesis): true_positive += (r & h).duration false_positive = 0. for r_, h in reference_.co_iter(hypothesis): false_positive += (r_ & h).duration false_negative = 0. for r, h_ in reference.co_iter(hypothesis_): false_negative += (r & h_).duration detail = {DFS_PRECISION_RETRIEVED: true_positive + false_positive, DFS_RECALL_RELEVANT: true_positive + false_negative, DFS_RELEVANT_RETRIEVED: true_positive} return detail
[docs] def compute_metric(self, detail): _, _, value = self.compute_metrics(detail=detail) return value
def compute_metrics(self, detail=None): detail = self.accumulated_ if detail is None else detail precision_retrieved = detail[DFS_PRECISION_RETRIEVED] recall_relevant = detail[DFS_RECALL_RELEVANT] relevant_retrieved = detail[DFS_RELEVANT_RETRIEVED] # Special cases : precision if precision_retrieved == 0.: precision = 1 else: precision = relevant_retrieved / precision_retrieved # Special cases : recall if recall_relevant == 0.: if relevant_retrieved == 0: recall = 1. else: recall = 0. else: recall = relevant_retrieved / recall_relevant return precision, recall, f_measure(precision, recall, beta=self.beta)
DCF_NAME = 'detection cost function' DCF_POS_TOTAL = 'positive class total' # Total duration of positive class. DCF_NEG_TOTAL = 'negative class total' # Total duration of negative class. DCF_FALSE_ALARM = 'false alarm' # Total duration of false alarms. DCF_MISS = 'miss' # Total duration of misses.
[docs]class DetectionCostFunction(UEMSupportMixin, BaseMetric): """Detection cost function. This metric can be used to evaluate binary classification tasks such as speech activity detection. Inputs are expected to only contain segments corresponding to the positive class (e.g. speech regions). Gaps in the inputs considered as the negative class (e.g. non-speech regions). Detection cost function (DCF), as defined by NIST for OpenSAT 2019, is 0.25*far + 0.75*missr, where far is the false alarm rate (i.e., the proportion of non-speech incorrectly classified as speech) and missr is the miss rate (i.e., the proportion of speech incorrectly classified as non-speech. Parameters ---------- collar : float, optional Duration (in seconds) of collars removed from evaluation around boundaries of reference segments (one half before, one half after). Defaults to 0.0. skip_overlap : bool, optional Set to True to not evaluate overlap regions. Defaults to False (i.e. keep overlap regions). fa_weight : float, optional Weight for false alarm rate. Defaults to 0.25. miss_weight : float, optional Weight for miss rate. Defaults to 0.75. kwargs Keyword arguments passed to :class:`pyannote.metrics.base.BaseMetric`. References ---------- "OpenSAT19 Evaluation Plan v2." https://www.nist.gov/system/files/documents/2018/11/05/opensat19_evaluation_plan_v2_11-5-18.pdf """ def __init__(self, collar=0.0, skip_overlap=False, fa_weight=0.25, miss_weight=0.75, **kwargs): super(DetectionCostFunction, self).__init__(**kwargs) self.collar = collar self.skip_overlap = skip_overlap self.fa_weight = fa_weight self.miss_weight = miss_weight @classmethod def metric_name(cls): return DCF_NAME @classmethod def metric_components(cls): return [DCF_POS_TOTAL, DCF_NEG_TOTAL, DCF_FALSE_ALARM, DCF_MISS]
[docs] def compute_components(self, reference, hypothesis, uem=None, **kwargs): reference, hypothesis, uem = self.uemify( reference, hypothesis, uem=uem, collar=self.collar, skip_overlap=self.skip_overlap, returns_uem=True) # Obtain timelines corresponding to positive class. reference = reference.get_timeline(copy=False).support() hypothesis = hypothesis.get_timeline(copy=False).support() # Obtain timelines corresponding to negative class. reference_ = reference.gaps(support=uem) hypothesis_ = hypothesis.gaps(support=uem) # Compute total positive/negative durations. pos_dur = reference.duration() neg_dur = reference_.duration() # Compute total miss duration. miss_dur = 0.0 for r, h_ in reference.co_iter(hypothesis_): miss_dur += (r & h_).duration # Compute total false alarm duration. fa_dur = 0.0 for r_, h in reference_.co_iter(hypothesis): fa_dur += (r_ & h).duration components = { DCF_POS_TOTAL : pos_dur, DCF_NEG_TOTAL : neg_dur, DCF_MISS : miss_dur, DCF_FALSE_ALARM : fa_dur} return components
[docs] def compute_metric(self, components): def _compute_rate(num, denom): if denom == 0.0: if num == 0.0: return 0.0 return 1.0 return num/denom # Compute false alarm rate. neg_dur = components[DCF_NEG_TOTAL] fa_dur = components[DCF_FALSE_ALARM] fa_rate = _compute_rate(fa_dur, neg_dur) # Compute miss rate. pos_dur = components[DCF_POS_TOTAL] miss_dur = components[DCF_MISS] miss_rate = _compute_rate(miss_dur, pos_dur) return self.fa_weight*fa_rate + self.miss_weight*miss_rate