o
    i6                     @   sh  d Z ddlZddlZddlZddlmZ ddlmZmZ ddl	m
Z
mZ ddlZddlZddlmZmZ ddlmZ dd	lmZmZmZ d
dlmZ d
dlmZmZmZmZ eddddZeddddZ e!e"Z#eeh ddge$ej%dgdgdgdgdgdgdgeed
dddgeeddddgd
dddddddddddd d
d!d"Z&	 d'd#d$Z'd%d& Z(dS )(zKDDCUP 99 dataset.

A classic dataset for anomaly detection.

The dataset page is available from UCI Machine Learning Repository

https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz

    N)GzipFile)IntegralReal)existsjoin   )Bunchcheck_random_state)shuffle)Interval
StrOptionsvalidate_params   )get_data_home)RemoteFileMetadata_convert_data_dataframe_fetch_remote
load_descrZkddcup99_dataz.https://ndownloader.figshare.com/files/5976045Z@3b6c942aa0356c0ca35b7b595a26c89d343652c9db428893e7494f837b274292)filenameurlZchecksumZkddcup99_10_dataz.https://ndownloader.figshare.com/files/5976042Z@8045aca0d84e70e622d1148d7df782496f6333bf6eb979a1b0837c42a9fd9561>   smtpSAhttpSFbooleanrandom_stateleft)closedg        Zneither)
subset	data_homer
   r   	percent10download_if_missing
return_X_yas_frame	n_retriesdelayT)Zprefer_skip_nested_validationF         ?c        
         C   s  t |d}t|||||	d}
|
j}|
j}|
j}|
j}| dkre|dk}t|}||ddf }|| }||ddf }|| }|jd }t	|}|
d|d}|| }|| }tj||f }tj||f }| dksr| d	ksr| d
kr|dddf dk}tj||ddf ||ddf f }|dd |dd  }|| }t|dddf d jtdd|dddf< t|dddf d jtdd|dddf< t|dddf d jtdd|dddf< | d	kr#|dddf dk}|| }|| }tj|dddf |dddf |dddf f }|d |d |d g}| d
kr_|dddf dk}|| }|| }tj|dddf |dddf |dddf f }|d |d |d g}| dkrtj|dddf |dddf |dddf |dddf f }|d |d |d |d g}|rt|||d\}}td}d}|rtd||||\}}}|r||fS t||||||dS )a  Load the kddcup99 dataset (classification).

    Download it if necessary.

    =================   ====================================
    Classes                                               23
    Samples total                                    4898431
    Dimensionality                                        41
    Features            discrete (int) or continuous (float)
    =================   ====================================

    Read more in the :ref:`User Guide <kddcup99_dataset>`.

    .. versionadded:: 0.18

    Parameters
    ----------
    subset : {'SA', 'SF', 'http', 'smtp'}, default=None
        To return the corresponding classical subsets of kddcup 99.
        If None, return the entire kddcup 99 dataset.

    data_home : str or path-like, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

        .. versionadded:: 0.19

    shuffle : bool, default=False
        Whether to shuffle dataset.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset shuffling and for
        selection of abnormal samples if `subset='SA'`. Pass an int for
        reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    percent10 : bool, default=True
        Whether to load only 10 percent of the data.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` object.

        .. versionadded:: 0.20

    as_frame : bool, default=False
        If `True`, returns a pandas Dataframe for the ``data`` and ``target``
        objects in the `Bunch` returned object; `Bunch` return object will also
        have a ``frame`` member.

        .. versionadded:: 0.24

    n_retries : int, default=3
        Number of retries when HTTP errors are encountered.

        .. versionadded:: 1.5

    delay : float, default=1.0
        Number of seconds between retries.

        .. versionadded:: 1.5

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (494021, 41)
            The data matrix to learn. If `as_frame=True`, `data` will be a
            pandas DataFrame.
        target : {ndarray, series} of shape (494021,)
            The regression target for each sample. If `as_frame=True`, `target`
            will be a pandas Series.
        frame : dataframe of shape (494021, 42)
            Only present when `as_frame=True`. Contains `data` and `target`.
        DESCR : str
            The full description of the dataset.
        feature_names : list
            The names of the dataset columns
        target_names: list
            The names of the target columns

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarray. The first containing a 2D array of
        shape (n_samples, n_features) with each row representing one
        sample and each column representing the features. The second
        ndarray of shape (n_samples,) containing the target samples.

        .. versionadded:: 0.20
    r   )r   r    r!   r$   r%   r   s   normal.Nr   i1  r   r   r      r      g?F)copy      r   s   https   smtp)r   zkddcup99.rstfetch_kddcup99)datatargetframetarget_namesfeature_namesZDESCR)r   _fetch_brute_kddcup99r/   r0   r3   r2   npZlogical_notshaper	   randintZr_Zc_logastypefloatshuffle_methodr   r   r   )r   r   r
   r   r    r!   r"   r#   r$   r%   kddcup99r/   r0   r3   r2   stZnormal_samplesZnormal_targetsZabnormal_samplesZabnormal_targetsZn_samples_abnormalrZfdescrr1    r@   i/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/sklearn/datasets/_kddcup99.pyr.   6   s   
z

&000
4
4
B

r.   c              
   C   sT  t | d} d}|rt| d| }t}n	t| d| }t}t|d}t|d}	t|}
g dtfdd	d
dtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfd tfd!tfd"tfd#tfd$tfd%tfd&tfd'tfd(tfd)tfd*tfd+tfd,tfd-tfd.tfd/tfd0}d1d2 |D }|d3 }|d4d3 }|
rzt|}t|	}W n t	y } z	t
d5| d6|d4}~ww |rt| td7|j  t||||d8 t|}td9 t||j}t|d:d;}g }| D ]}| }||d<d=d> q6|  td? t| tj|td@}tdAD ]}|d4d4|f  || |d4d4|f< qc|d4d4d4d3f }|d4d4d3f }tj!||dBdC tj!||	dBdC nt
dDt"||||gdES )Fa5  Load the kddcup99 dataset, downloading it if necessary.

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    percent10 : bool, default=True
        Whether to load only 10 percent of the data.

    n_retries : int, default=3
        Number of retries when HTTP errors are encountered.

    delay : float, default=1.0
        Number of seconds between retries.

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (494021, 41)
            Each row corresponds to the 41 features in the dataset.
        target : ndarray of shape (494021,)
            Each value corresponds to one of the 21 attack types or to the
            label 'normal.'.
        feature_names : list
            The names of the dataset columns
        target_names: list
            The names of the target columns
        DESCR : str
            Description of the kddcup99 dataset.

    r(   z-py3Zkddcup99_10r<   Zsamplestargetsduration)Zprotocol_typeZS4)ZserviceZS11)flagZS6Z	src_bytesZ	dst_bytesZlandZwrong_fragmentZurgentZhotZnum_failed_loginsZ	logged_inZnum_compromisedZ
root_shellZsu_attemptedZnum_rootZnum_file_creationsZ
num_shellsZnum_access_filesZnum_outbound_cmdsZis_host_loginZis_guest_logincountZ	srv_countZserror_rateZsrv_serror_rateZrerror_rateZsrv_rerror_rateZsame_srv_rateZdiff_srv_rateZsrv_diff_host_rateZdst_host_countZdst_host_srv_countZdst_host_same_srv_rateZdst_host_diff_srv_rateZdst_host_same_src_port_rateZdst_host_srv_diff_host_rateZdst_host_serror_rateZdst_host_srv_serror_rateZdst_host_rerror_rateZdst_host_srv_rerror_rate)labelsZS16c                 S   s   g | ]}|d  qS )r   r@   ).0cr@   r@   rA   
<listcomp>p  s    z)_fetch_brute_kddcup99.<locals>.<listcomp>Nz7The cache for fetch_kddcup99 is invalid, please delete z! and run the fetch_kddcup99 againzDownloading %s)dirnamer$   r%   zextracting archiver?   )r   mode
 ,zextraction done)dtype*   r   )compressz1Data not found and `download_if_missing` is False)r/   r0   r3   r2   )#r   r   ARCHIVE_10_PERCENTARCHIVEr   intr:   joblibload	ExceptionOSError_mkdirploggerinfor   r   r5   rP   debugr   r   	readlinesdecodeappendreplacesplitcloseosremoveZasarrayobjectranger9   dumpr   )r   r!   r    r$   r%   Z
dir_suffixZ
kddcup_dirarchiveZsamples_pathZtargets_path	availabledtZcolumn_namesr2   r3   XyeZDTarchive_pathfile_ZXylinejr@   r@   rA   r4   
  s  
+

	
 !"#$%&'()*-




*r4   c              
   C   sF   zt |  W dS  ty" } z|jtjkr W Y d}~dS d}~ww )zgEnsure directory d exists (like mkdir -p on Unix)
    No guarantee that the directory is writable.
    N)rd   makedirsrY   errnoEEXIST)drn   r@   r@   rA   rZ     s   rZ   )NTTr&   r'   ))__doc__rt   loggingrd   gzipr   numbersr   r   Zos.pathr   r   rV   numpyr5   utilsr   r	   r
   r;   Zutils._param_validationr   r   r   rN   r   _baser   r   r   r   rT   rS   	getLogger__name__r[   strPathLiker.   r4   rZ   r@   r@   r@   rA   <module>   sn    	

 G
 