o
    *iS                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d	Zd
ZdZdgdgddgdZdedee dedeeeef  fddZG dd deZdS )    N)Path)ListTupleUnion)Tensor)Dataset)download_url_to_file)_get_librispeech_metadata)_extract_tarZlibrispeech_finetuningzIhttps://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgzZ@5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342afz1h/0z1h/*Z9h)10minZ1hZ10hpathfolders
_ext_audioreturnc                    s^   t   g }|D ]} fdd | d| D }|dd |D 7 }q|jdd d |S )a  Get the file names and the corresponding file paths without `speaker_id`
    and `chapter_id` directories.
    The format of path is like:
        {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
        {root}/{_ARCHIVE_NAME}/9h/[clean, other]

    Args:
        path (Path): Root path to the dataset.
        folders (List[str]): Folders that contain the desired audio files.
        _ext_audio (str): Extension of audio files.

    Returns:
        List[Tuple[str, str]]:
            List of tuples where the first element is the relative path to the audio file.
            The format of relative path is like:
            1h/[0-5]/[clean, other] or 9h/[clean, other]
            The second element is the file name without audio extension.
    c                    s   g | ]}|  qS  )relative_to.0pr   r   u/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/torchaudio/datasets/librilight_limited.py
<listcomp>*   s    z&_get_fileids_paths.<locals>.<listcomp>z/*/*/*/*c                 S   s$   g | ]}t |jjjt |jfqS r   )strparentstemr   r   r   r   r   +   s   $ c                 S   s   | d | d  S )Nr      r   )xr   r   r   <lambda>,   s    z$_get_fileids_paths.<locals>.<lambda>)key)r   globsort)r   r   r   Zfiles_pathsfolderpathsr   r   r   _get_fileids_paths   s   "r#   c                
   @   sp   e Zd ZdZdZdZ		ddeeef dede	d	d
fddZ
ded	eeeeeeef fddZd	efddZd
S )LibriLightLimiteda  Subset of Libri-light :cite:`librilight` dataset,
    which was used in HuBERT :cite:`hsu2021hubert` for supervised fine-tuning.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        subset (str, optional): The subset to use. Options: [``"10min"``, ``"1h"``, ``"10h"``]
            (Default: ``"10min"``).
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
    z
.trans.txtz.flacr   Frootsubsetdownloadr   Nc                 C   s   |t vrtdt   d| t | }t|}tj|t| _tj|t d}tj	| jsI|s8t
dtj|sEtt|td t| t| j|| j| _d S )Nz`subset` must be one of z	. Found: z.tgzz9Dataset not found. Please use `download=True` to download)Zhash_prefix)_SUBSET_MAP
ValueErrorkeysosfspathr   join_ARCHIVE_NAME_pathisdirRuntimeErrorisfiler   _URL	_CHECKSUMr
   r#   r   _fileids_paths)selfr%   r&   r'   r   archiver   r   r   __init__?   s   
zLibriLightLimited.__init__nc                 C   sT   | j | \}}t|| j|| j| j}ttj	| j|d \}}|f|dd  S )a  Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded
        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            int:
                Speaker ID
            int:
                Chapter ID
            int:
                Utterance ID
        r   r   N)
r5   r	   r/   r   _ext_txt
torchaudioloadr+   r   r-   )r6   r9   	file_pathZfileidmetadataZwaveform_r   r   r   __getitem__T   s   zLibriLightLimited.__getitem__c                 C   s
   t | jS )N)lenr5   )r6   r   r   r   __len__n   s   
zLibriLightLimited.__len__)r   F)__name__
__module____qualname____doc__r:   r   r   r   r   boolr8   intr   r   r@   rB   r   r   r   r   r$   0   s"    

"r$   )r+   pathlibr   typingr   r   r   r;   Ztorchr   Ztorch.utils.datar   Ztorchaudio._internalr   Ztorchaudio.datasets.librispeechr	   Ztorchaudio.datasets.utilsr
   r.   r3   r4   r(   r   r#   r$   r   r   r   r   <module>   s    *