o
    iJ                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ dededefddZdedede
jfddZdd Z	dddZ	dddZ		dddZdS )z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.    N)OrderedDict)	Generator)List   )_arff)ArffSparseDataType)chunk_generatorget_chunk_n_rows)check_pandas_support)	pd_fillna	arff_datainclude_columnsreturnc                 C   s   t  t  t  f}dd t|D }t| d | d | d D ] \}}}||v r=|d | |d | |d ||  q|S )a  Obtains several columns from sparse ARFF representation. Additionally,
    the column indices are re-labelled, given the columns that are not
    included. (e.g., when including [1, 2, 3], the columns will be relabelled
    to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; first list indicating the value,
        second the x coordinate and the third the y coordinate.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the include columns indicated by the
        include_columns argument.
    c                 S      i | ]\}}||qS  r   .0Z	array_idxZ
column_idxr   r   l/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/sklearn/datasets/_arff_parser.py
<dictcomp>.       
z)_split_sparse_columns.<locals>.<dictcomp>r      r   )list	enumeratezipappend)r   r   Zarff_data_newreindexed_columnsvalrow_idxcol_idxr   r   r   _split_sparse_columns   s   "r   c           	      C   s~   t | d d }|t|f}dd t|D }tj|tjd}t| d | d | d D ]\}}}||v r<||||| f< q+|S )Nr   c                 S   r   r   r   r   r   r   r   r   @   r   z)_sparse_data_to_array.<locals>.<dictcomp>dtyper   r   )maxlenr   npemptyfloat64r   )	r   r   num_obsZy_shaper   yr   r   r   r   r   r   _sparse_data_to_array9   s   "r)   c                 C   sP   | | }t |dkr| | }||fS t |dkr"| |d  }||fS d}||fS )a  Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
    r   r   r   N)r#   )frameZfeature_namesZtarget_namesXr(   r   r   r   _post_process_frameK   s   r,   c           "         s  dd }|| }|dkrt jnt j}|dk }	t j|||	d}
|| fdd|
d D  |dkrtd	}t|
d }t| }t|
d
 }|j	|g|dd}|j
dd }t|}fdd|D }|| g}t|
d
 |D ]}||j	||dd|  qrt|dkr|d |d j|d< |j|dd}t||}~~i }|jD ]%}| d }| dkrd||< q| dkrd||< q|j| ||< q||}t|||\}n|
d
 }fdd|D }fdd|D }t|tr3|du rtd|d dkr	d}n|d |d  }tjtj|d|d }|j| }|dd|f }|dd|f n@t|t rjt!||}t"|d d }|t|f} t#j$j%|d |d |d ff| tj&d!}|' }t(||n	td"t)|  fd#d$|D }!|!snt*|!rt+ fd%dt,|D n	t-|!rtd&j.d dkrd'n
j.d dkrd|dkr||dfS |d fS )(a  ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is needed
    to parse sparse datasets. For dense datasets, it is recommended to instead
    use the pandas-based parser, although it does not always handles the
    dtypes exactly the same.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    c                 s   s    | D ]}| dV  qd S )Nutf-8)decode)	gzip_fileliner   r   r   _io_to_generator   s   z+_liac_arff_parser.<locals>._io_to_generatorsparsepandas)return_typeencode_nominalc                    s(   i | ]\}}t |tr| v r||qS r   )
isinstancer   )r   namecatcolumns_to_selectr   r   r      s    z%_liac_arff_parser.<locals>.<dictcomp>
attributeszfetch_openml with as_frame=TruedataF)columnscopyT)deepc                       g | ]}| v r|qS r   r   r   colr9   r   r   
<listcomp>       z%_liac_arff_parser.<locals>.<listcomp>r   r   r   )Zignore_index	data_typeintegerInt64nominalcategoryc                       g | ]
}t  | d  qS indexintr   col_nameopenml_columns_infor   r   rC          c                    rJ   rK   rM   rO   rQ   r   r   rC      rS   Nz6shape must be provided when arr['data'] is a Generatorr&   )r!   count)shaper!   z-Unexpected type for data obtained from arff: c                    s   h | ]}| v qS r   r   rO   )
categoriesr   r   	<setcomp>  s    z$_liac_arff_parser.<locals>.<setcomp>c              
      sJ   g | ]!\}}t t j |d ddd||d f jtddqS )Or    Nr   F)r>   )r$   ZtakeZasarraypopastyperN   )r   irP   )rW   r(   r   r   rC     s     zAMix of nominal and non-nominal targets is not currently supported)rT   )/r   ZCOOZ	DENSE_GENloadr
   r   r   keysnextZ	DataFrameZmemory_usagesumr	   r   r   r#   r[   dtypesconcatr   r=   lowerr,   r6   r   
ValueErrorr$   Zfromiter	itertoolschainfrom_iterableZreshapetupler   r"   spr2   Z
coo_matrixr&   Ztocsrr)   typeallZhstackr   anyrV   )"r/   output_arrays_typerR   feature_names_to_selecttarget_names_to_selectrV   r1   streamr4   r5   Zarff_containerpdZcolumns_infoZcolumns_names	first_rowZfirst_dfZ	row_bytes	chunksizecolumns_to_keepdfsr<   r*   ra   r7   column_dtyper+   r   Zfeature_indices_to_selectZtarget_indices_to_selectrU   Zarff_data_Xr'   ZX_shapeZis_classificationr   )rW   r:   rR   r(   r   _liac_arff_parserk   s   7
















	
rw   c              
      s  ddl | D ]}|d dr nqi |D ]}|| d }| dkr,d|< q| dkr6d	|< qfd
dt|D }	dddgddddd|	d	}
i |
|pUi }j| fi |}z
dd |D |_W n ty} } zj	d|d}~ww ||   fdd|jD }|| }t
dfdd}fdd|j D }|D ]}|| j|||< qt|||\}}|dkr|||dfS | | }}fdd|j D }||d|fS )a^  ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the metadata
    headers of ARFF file itself. The data is loaded as a CSV file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    r   Nr-   z@datarE   rF   rG   rH   rI   c                    s"   i | ]\}}| v r| | qS r   r   )r   r   r7   )ra   r   r   r     s
    z'_pandas_arff_parser.<locals>.<dictcomp>F?%"T\)	headerZ	index_colZ	na_valuesZkeep_default_nacomment	quotecharskipinitialspace
escapecharr!   c                 S   s   g | ]}|qS r   r   )r   r7   r   r   r   rC     s    z'_pandas_arff_parser.<locals>.<listcomp>zwThe number of columns provided by OpenML does not match the number of columns inferred by pandas when reading the file.c                    r@   r   r   rA   r9   r   r   rC     rD   z^'(?P<contents>.*)'$c                    s"   t  | }|d u r| S |dS )Ncontents)researchgroup)Zinput_stringmatch)single_quote_patternr   r   strip_single_quotes  s   
z0_pandas_arff_parser.<locals>.strip_single_quotesc                    s    g | ]\}}t | jr|qS r   )r6   CategoricalDtyper   r7   r!   rq   r   r   rC     s    
r3   c                    s(   i | ]\}}t | jr||j qS r   )r6   r   rW   tolistr   r   r   r   r     s    

)r3   r.   rc   
startswithr   Zread_csvr=   rd   errorsZParserErrorr   compilera   itemsr8   Zrename_categoriesr,   Zto_numpy)r/   rm   rR   rn   ro   read_csv_kwargsr0   r7   rv   Zdtypes_positionalZdefault_read_csv_kwargsr*   excrt   r   Zcategorical_columnsrB   r+   r(   rW   r   )r:   ra   rq   r   r   _pandas_arff_parser7  sp   8




r   c                 C   sD   |dkrt | |||||S |dkrt| |||||S td| d)a6  Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    z	liac-arffr3   zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.)rw   r   rd   )r/   parseroutput_typerR   rn   ro   rV   r   r   r   r   load_arff_from_gzip_file  s*   ;	
r   )N)NN)__doc__re   r   collectionsr   collections.abcr   typingr   numpyr$   Zscipyri   Z	externalsr   Zexternals._arffr   Zutils._chunkingr   r	   Zutils._optional_dependenciesr
   Zutils.fixesr   r   Zndarrayr)   r,   rw   r   r   r   r   r   r   <module>   sJ    
#
&
 S
  