o
    *i1                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ dZd	Zd
Zg dZddddddddZdd Zdedededededee deeeeeeef fddZG dd  d e
ZdS )!    N)Path)ListTupleUnion)Tensor)Dataset)download_url_to_file)_extract_tar_load_waveformtrain-clean-100ZLibriSpeechi>  )z	dev-cleanz	dev-otherz
test-cleanz
test-otherr   ztrain-clean-360ztrain-other-500Z@76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3Z@12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365Z@39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23Z@d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29Z@d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2Z@146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecfZ@ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2)z4http://www.openslr.org/resources/12/dev-clean.tar.gzz4http://www.openslr.org/resources/12/dev-other.tar.gzz5http://www.openslr.org/resources/12/test-clean.tar.gzz5http://www.openslr.org/resources/12/test-other.tar.gzz:http://www.openslr.org/resources/12/train-clean-100.tar.gzz:http://www.openslr.org/resources/12/train-clean-360.tar.gzz:http://www.openslr.org/resources/12/train-other-500.tar.gzc                 C   s^   d}d}|| }t j| |}t j||}t j|s)t|d }t|||d t| d S )Nz$http://www.openslr.org/resources/12/z.tar.gz)Zhash_prefix)ospathjoinisfile
_CHECKSUMSgetr   r	   )rooturlbase_urlZext_archivefilenamearchiveZdownload_urlZchecksum r   v/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/torchaudio/datasets/librispeech_biasing.py_download_librispeech!   s   r   fileidr   folder	ext_audioext_txtblistreturnc                 C   s  |pg }|  d\}}}| d| d| }	tj||||	 | }
| d| | }tj|||||}g }t|8}|D ]&}|  dd\}}|	|kre|  D ]}||v rb||vrb|| qS nq?td|	 W d    n1 sww   Y  |
t|t	|t	|t	||fS )N-    zTranslation not found for )
splitr   r   r   openstripappendFileNotFoundErrorSAMPLE_RATEint)r   r   r   r   r   r   Z
speaker_idZ
chapter_idZutterance_idZfileid_audiofilepathZ	file_textZuttblistftlineZfileid_textZ
transcriptwordr   r   r   _get_librispeech_metadata.   s8   

r.   c                   @   s   e Zd ZdZdZdZeeddfdee	e
f de	de	d	ed
ee	 ddfddZdedeeee	eeef fddZdedeeee	eeef fddZdefddZdS )LibriSpeechBiasinga  *LibriSpeech* :cite:`7178964` dataset with prefix-tree construction and biasing support.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        url (str, optional): The URL to download the dataset from,
            or the type of the dataset to dowload.
            Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``,
            ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and
            ``"train-other-500"``. (default: ``"train-clean-100"``)
        folder_in_archive (str, optional):
            The top-level directory of the dataset. (default: ``"LibriSpeech"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        blist (list, optional):
            The list of biasing words (default: ``[]``).
    z
.trans.txtz.flacFNr   r   folder_in_archivedownloadr   r   c                 C   s   || _ |tvrtd| dt dt|}tj||| _tj|||| _tj	| js@|r7t
|| n	td| j dtdd t| jd| j D | _|| _d S )	NzInvalid url 'z' given; please provide one of .zDataset not found at z5. Please set `download=True` to download the dataset.c                 s   s    | ]}t |jV  qd S N)strstem).0pr   r   r   	<genexpr>   s    z.LibriSpeechBiasing.__init__.<locals>.<genexpr>z*/*/*)_url_DATA_SUBSETS
ValueErrorr   fspathr   r   _archive_pathisdirr   RuntimeErrorsortedr   glob
_ext_audio_walkerr   )selfr   r   r0   r1   r   r   r   r   __init__i   s   
&
zLibriSpeechBiasing.__init__nc                 C   s&   | j | }t|| j| j| j| j| jS )a  Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            str:
                Path to audio
            int:
                Sample rate
            str:
                Transcript
            int:
                Speaker ID
            int:
                Chapter ID
            int:
                Utterance ID
            list:
                List of biasing words in the utterance
        )rD   r.   r=   r9   rC   _ext_txtr   )rE   rG   r   r   r   r   get_metadata   s   
zLibriSpeechBiasing.get_metadatac                 C   s2   |  |}t| j|d |d }|f|dd  S )a
  Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            int:
                Speaker ID
            int:
                Chapter ID
            int:
                Utterance ID
            list:
                List of biasing words in the utterance
        r   r"   N)rI   r
   r=   )rE   rG   metadataZwaveformr   r   r   __getitem__   s   
zLibriSpeechBiasing.__getitem__c                 C   s
   t | jS r3   )lenrD   )rE   r   r   r   __len__   s   
zLibriSpeechBiasing.__len__)__name__
__module____qualname____doc__rH   rC   URLFOLDER_IN_ARCHIVEr   r4   r   boolr   rF   r)   r   r   rI   rK   rM   r   r   r   r   r/   T   s0    

""r/   )r   pathlibr   typingr   r   r   Ztorchr   Ztorch.utils.datar   Ztorchaudio._internalr   Ztorchaudio.datasets.utilsr	   r
   rR   rS   r(   r:   r   r   r4   r)   r.   r/   r   r   r   r   <module>   sH    

&