B
    Kkd(              !   @   s  d dl mZmZmZ d dlmZmZ d dlmZm	Z	 d dl
Z
d dlZd dlmZ ddlmZmZmZmZ ddlmZ dd	lmZ d d
lmZ yd dlmZ W n ek
r   eZY nX edd eD Zedd eD Zedd eD ZeeddgB ZdZej rJed dkr&e!ddks*t"e#edd e$d d Z%n
e#eZ%e&ddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5g Z'e#d6Z(i Z)G d7d8 d8e*Z+d9d: Z,G d;d< d<e*Z-G d=d> d>e-Z.G d?d@ d@e/Z0G dAdB dBe*Z1G dCdD dDe*Z2dEdF Z3dS )G    )absolute_importdivisionunicode_literals)	text_typebinary_type)http_clienturllibN)webencodings   )EOFspaceCharactersasciiLettersasciiUppercase)_ReparseException)_utils)StringIO)BytesIOc             C   s   g | ]}| d qS )ascii)encode).0item r   B/tmp/pip-install-gxxfd9b7/pip/pip/_vendor/html5lib/_inputstream.py
<listcomp>   s    r   c             C   s   g | ]}| d qS )r   )r   )r   r   r   r   r   r      s    c             C   s   g | ]}| d qS )r   )r   )r   r   r   r   r   r      s       >   <u   [---﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿]]z"\uD800-\uDFFF"i i i i i i i i i i i i i i i i i	 i	 i
 i
 i i i i i i i i i i i i z[	- -/:-@\[-`{-~]c               @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )BufferedStreamzBuffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    c             C   s   || _ g | _ddg| _d S )Nr   r   )streambufferposition)selfr   r   r   r   __init__@   s    zBufferedStream.__init__c             C   s@   d}x(| j d | jd  D ]}|t|7 }qW || jd 7 }|S )Nr   r
   )r    r!   len)r"   poschunkr   r   r   tellE   s
    zBufferedStream.tellc             C   sX   ||   kst|}d}x0t| j| |k rH|t| j| 8 }|d7 }qW ||g| _d S )Nr   r
   )_bufferedBytesAssertionErrorr$   r    r!   )r"   r%   offsetir   r   r   seekL   s    zBufferedStream.seekc             C   sT   | j s| |S | jd t| j krF| jd t| j d krF| |S | |S d S )Nr   r
   r   )r    _readStreamr!   r$   _readFromBuffer)r"   bytesr   r   r   readU   s    

zBufferedStream.readc             C   s   t dd | jD S )Nc             S   s   g | ]}t |qS r   )r$   )r   r   r   r   r   r   _   s    z1BufferedStream._bufferedBytes.<locals>.<listcomp>)sumr    )r"   r   r   r   r(   ^   s    zBufferedStream._bufferedBytesc             C   s<   | j |}| j| | jd  d7  < t|| jd< |S )Nr   r
   )r   r0   r    appendr!   r$   )r"   r/   datar   r   r   r-   a   s
    zBufferedStream._readStreamc             C   s   |}g }| j d }| j d }x|t| jk r|dkr|dks@t| j| }|t|| krn|}||| g| _ n"t|| }|t|g| _ |d7 }|||||   ||8 }d}qW |r|| | d|S )Nr   r
       )r!   r$   r    r)   r2   r-   join)r"   r/   remainingBytesrvbufferIndexbufferOffsetbufferedDatabytesToReadr   r   r   r.   h   s&    


zBufferedStream._readFromBufferN)__name__
__module____qualname____doc__r#   r'   r,   r0   r(   r-   r.   r   r   r   r   r   9   s   		r   c             K   s   t | tjs(t | tjjr.t | jtjr.d}n&t| drJt | dt	}n
t | t	}|rdd |D }|rvt
d| t| f|S t| f|S d S )NFr0   r   c             S   s   g | ]}| d r|qS )	_encoding)endswith)r   xr   r   r   r      s    z#HTMLInputStream.<locals>.<listcomp>z3Cannot set an encoding with a unicode input, set %r)
isinstancer   HTTPResponser   responseaddbasefphasattrr0   r   	TypeErrorHTMLUnicodeInputStreamHTMLBinaryInputStream)sourcekwargs	isUnicode	encodingsr   r   r   HTMLInputStream   s    

rP   c               @   sp   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dddZdd Zdd ZdddZdd ZdS )rJ   zProvides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    i (  c             C   sZ   t jsd| _ntddkr$| j| _n| j| _dg| _tddf| _| 	|| _
|   dS )a  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        Nu   􏿿r
   r   zutf-8certain)r   supports_lone_surrogatesreportCharacterErrorsr$   characterErrorsUCS4characterErrorsUCS2newLineslookupEncodingcharEncoding
openStream
dataStreamreset)r"   rL   r   r   r   r#      s    
zHTMLUnicodeInputStream.__init__c             C   s.   d| _ d| _d| _g | _d| _d| _d | _d S )N r   )r&   	chunkSizechunkOffseterrorsprevNumLinesprevNumCols_bufferedCharacter)r"   r   r   r   r[      s    zHTMLUnicodeInputStream.resetc             C   s   t |dr|}nt|}|S )zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r0   )rH   r   )r"   rL   r   r   r   r   rY      s    
z!HTMLUnicodeInputStream.openStreamc             C   sT   | j }|dd|}| j| }|dd|}|dkr@| j| }n||d  }||fS )N
r   r   r
   )r&   countr`   rfindra   )r"   r*   r&   nLinespositionLinelastLinePospositionColumnr   r   r   	_position   s    
z HTMLUnicodeInputStream._positionc             C   s   |  | j\}}|d |fS )z:Returns (line, col) of the current position in the stream.r
   )rj   r^   )r"   linecolr   r   r   r!      s    zHTMLUnicodeInputStream.positionc             C   s6   | j | jkr|  stS | j }| j| }|d | _ |S )zo Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        r
   )r^   r]   	readChunkr   r&   )r"   r^   charr   r   r   rn      s    

zHTMLUnicodeInputStream.charNc             C   s   |d kr| j }| | j\| _| _d| _d| _d| _| j|}| j	rX| j	| }d | _	n|s`dS t
|dkrt|d }|dksd|  krdkrn n|d | _	|d d }| jr| | |d	d
}|dd
}|| _t
|| _dS )Nr\   r   Fr
   r      i   i  z
rc   T)_defaultChunkSizerj   r]   r`   ra   r&   r^   rZ   r0   rb   r$   ordrS   replace)r"   r]   r3   lastvr   r   r   rm      s0    
 


z HTMLUnicodeInputStream.readChunkc             C   s,   x&t tt|D ]}| jd qW d S )Nzinvalid-codepoint)ranger$   invalid_unicode_refindallr_   r2   )r"   r3   _r   r   r   rT   %  s    z*HTMLUnicodeInputStream.characterErrorsUCS4c             C   s   d}xt |D ]}|rqt| }| }t|||d  rtt|||d  }|tkrn| j	
d d}q|dkr|dkr|t|d kr| j	
d qd}| j	
d qW d S )NF   zinvalid-codepointTi   i  r
   )rv   finditerrr   groupstartr   isSurrogatePairsurrogatePairToCodepointnon_bmp_invalid_codepointsr_   r2   r$   )r"   r3   skipmatch	codepointr%   char_valr   r   r   rU   )  s     z*HTMLUnicodeInputStream.characterErrorsUCS2Fc       
      C   s  yt ||f }W nl tk
r|   x|D ]}t|dk s&tq&W ddd |D }|s^d| }td|  }t ||f< Y nX g }x||| j| j	}|dkr| j	| j
krP n0| }|| j
kr|| j| j	|  || _	P || j| j	d  |  sP qW d|}	|	S )z Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
           r\   c             S   s   g | ]}d t | qS )z\x%02x)rr   )r   cr   r   r   r   N  s    z5HTMLUnicodeInputStream.charsUntil.<locals>.<listcomp>z^%sz[%s]+N)charsUntilRegExKeyErrorrr   r)   r5   recompiler   r&   r^   r]   endr2   rm   )
r"   
charactersoppositecharsr   regexr7   mr   rr   r   r   
charsUntil@  s2    
 

z!HTMLUnicodeInputStream.charsUntilc             C   sT   |d k	rP| j dkr.|| j | _|  jd7  _n"|  j d8  _ | j| j  |ksPtd S )Nr   r
   )r^   r&   r]   r)   )r"   rn   r   r   r   ungeto  s    
zHTMLUnicodeInputStream.unget)N)F)r<   r=   r>   r?   rq   r#   r[   rY   rj   r!   rn   rm   rT   rU   r   r   r   r   r   r   rJ      s    
&
/rJ   c               @   sL   e Zd ZdZdddZdd Zd	d
 ZdddZdd Zdd Z	dd Z
dS )rK   zProvides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    Nwindows-1252Tc             C   sn   |  || _t| | j d| _d| _|| _|| _|| _|| _	|| _
| || _| jd dk	sbt|   dS )a  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        i   d   r   N)rY   	rawStreamrJ   r#   numBytesMetanumBytesChardetoverride_encodingtransport_encodingsame_origin_parent_encodinglikely_encodingdefault_encodingdetermineEncodingrX   r)   r[   )r"   rL   r   r   r   r   r   
useChardetr   r   r   r#     s    zHTMLBinaryInputStream.__init__c             C   s&   | j d j| jd| _t|  d S )Nr   rs   )rX   
codec_infostreamreaderr   rZ   rJ   r[   )r"   r   r   r   r[     s    zHTMLBinaryInputStream.resetc             C   sD   t |dr|}nt|}y||  W n   t|}Y nX |S )zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r0   )rH   r   r,   r'   r   )r"   rL   r   r   r   r   rY     s    
z HTMLBinaryInputStream.openStreamc             C   s  |   df}|d d k	r|S t| jdf}|d d k	r:|S t| jdf}|d d k	rX|S |  df}|d d k	rt|S t| jdf}|d d k	r|d jds|S t| jdf}|d d k	r|S |rryddl	m
} W n tk
r   Y nX g }| }xF|js<| j| j}t|tst|s&P || || qW |  t|jd }| jd |d k	rr|dfS t| jdf}|d d k	r|S tddfS )NrQ   r   	tentativezutf-16)UniversalDetectorencodingzwindows-1252)	detectBOMrW   r   r   detectEncodingMetar   name
startswithr   %pip._vendor.chardet.universaldetectorr   ImportErrordoner   r0   r   rC   r/   r)   r2   feedcloseresultr,   r   )r"   chardetrX   r   buffersdetectorr    r   r   r   r   r     sR    


z'HTMLBinaryInputStream.determineEncodingc             C   s   | j d dkstt|}|d kr&d S |jdkrFtd}|d k	stnT|| j d krf| j d df| _ n4| jd |df| _ |   td| j d |f d S )Nr
   rQ   )zutf-16bezutf-16lezutf-8r   zEncoding changed from %s to %s)rX   r)   rW   r   r   r,   r[   r   )r"   newEncodingr   r   r   changeEncoding  s    

z$HTMLBinaryInputStream.changeEncodingc          
   C   s   t jdt jdt jdt jdt jdi}| jd}t|t	s<t
||dd }d}|s~||}d}|s~||dd	 }d	}|r| j| t|S | jd
 dS dS )zAttempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return Nonezutf-8zutf-16lezutf-16bezutf-32lezutf-32be   N   ry   r   )codecsBOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BEr   r0   rC   r/   r)   getr,   rW   )r"   bomDictstringr   r,   r   r   r   r     s$    
zHTMLBinaryInputStream.detectBOMc             C   sV   | j | j}t|tstt|}| j d | }|dk	rR|j	dkrRt
d}|S )z9Report the encoding declared by the meta element
        r   N)zutf-16bezutf-16lezutf-8)r   r0   r   rC   r/   r)   EncodingParserr,   getEncodingr   rW   )r"   r    parserr   r   r   r   r   9  s    z(HTMLBinaryInputStream.detectEncodingMeta)NNNNr   T)T)r<   r=   r>   r?   r#   r[   rY   r   r   r   r   r   r   r   r   rK     s     
(
>"rK   c               @   s   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zeee
Zdd ZeeZefddZdd Zdd Zdd ZdS )EncodingByteszString-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raisedc             C   s   t |tstt| | S )N)rC   r/   r)   __new__lower)r"   valuer   r   r   r   L  s    zEncodingBytes.__new__c             C   s
   d| _ d S )Nr   )rj   )r"   r   r   r   r   r#   P  s    zEncodingBytes.__init__c             C   s   | S )Nr   )r"   r   r   r   __iter__T  s    zEncodingBytes.__iter__c             C   s>   | j d  }| _ |t| kr"tn|dk r.t| ||d  S )Nr
   r   )rj   r$   StopIterationrI   )r"   pr   r   r   __next__W  s    zEncodingBytes.__next__c             C   s   |   S )N)r   )r"   r   r   r   next_  s    zEncodingBytes.nextc             C   sB   | j }|t| krtn|dk r$t|d  | _ }| ||d  S )Nr   r
   )rj   r$   r   rI   )r"   r   r   r   r   previousc  s    zEncodingBytes.previousc             C   s   | j t| krt|| _ d S )N)rj   r$   r   )r"   r!   r   r   r   setPositionl  s    zEncodingBytes.setPositionc             C   s*   | j t| krt| j dkr"| j S d S d S )Nr   )rj   r$   r   )r"   r   r   r   getPositionq  s
    
zEncodingBytes.getPositionc             C   s   | | j | j d  S )Nr
   )r!   )r"   r   r   r   getCurrentByte{  s    zEncodingBytes.getCurrentBytec             C   sL   | j }x:|t| k r@| ||d  }||kr6|| _|S |d7 }qW || _dS )zSkip past a list of charactersr
   N)r!   r$   rj   )r"   r   r   r   r   r   r   r     s    zEncodingBytes.skipc             C   sL   | j }x:|t| k r@| ||d  }||kr6|| _|S |d7 }qW || _d S )Nr
   )r!   r$   rj   )r"   r   r   r   r   r   r   	skipUntil  s    zEncodingBytes.skipUntilc             C   s>   | j }| ||t|  }||}|r:|  j t|7  _ |S )zLook for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone)r!   r$   r   )r"   r/   r   r3   r7   r   r   r   
matchBytes  s    
zEncodingBytes.matchBytesc             C   sR   | | j d |}|dkrJ| jdkr,d| _|  j|t| d 7  _dS tdS )zLook for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the matchNr   r   r
   T)r!   findrj   r$   r   )r"   r/   newPositionr   r   r   jumpTo  s    
zEncodingBytes.jumpToN)r<   r=   r>   r?   r   r#   r   r   r   r   r   r   propertyr!   r   currentBytespaceCharactersBytesr   r   r   r   r   r   r   r   r   H  s    	
r   c               @   sX   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd ZdS )r   z?Mini parser for detecting character encoding from meta elementsc             C   s   t || _d| _dS )z3string - the data to work on for encoding detectionN)r   r3   r   )r"   r3   r   r   r   r#     s    
zEncodingParser.__init__c          
   C   s   d| j fd| jfd| jfd| jfd| jfd| jff}x^| jD ]T}d}xD|D ]<\}}| j|rJy| }P W qJ tk
r   d}P Y qJX qJW |s<P q<W | jS )	Ns   <!--s   <metas   </s   <!s   <?r   TF)	handleComment
handleMetahandlePossibleEndTaghandleOtherhandlePossibleStartTagr3   r   r   r   )r"   methodDispatchrx   keepParsingkeymethodr   r   r   r     s&    zEncodingParser.getEncodingc             C   s   | j dS )zSkip over commentss   -->)r3   r   )r"   r   r   r   r     s    zEncodingParser.handleCommentc             C   s   | j jtkrdS d}d }x|  }|d kr.dS |d dkr^|d dk}|r|d k	r|| _dS q|d dkr|d }t|}|d k	r|| _dS q|d dkrtt|d }| }|d k	rt|}|d k	r|r|| _dS |}qW d S )	NTFr   s
   http-equivr
   s   content-types   charsets   content)	r3   r   r   getAttributer   rW   ContentAttrParserr   parse)r"   	hasPragmapendingEncodingattrtentativeEncodingcodeccontentParserr   r   r   r     s:    zEncodingParser.handleMetac             C   s
   |  dS )NF)handlePossibleTag)r"   r   r   r   r     s    z%EncodingParser.handlePossibleStartTagc             C   s   t | j | dS )NT)r   r3   r   )r"   r   r   r   r     s    
z#EncodingParser.handlePossibleEndTagc             C   sf   | j }|jtkr(|r$|  |   dS |t}|dkrD|  n|  }x|d k	r`|  }qNW dS )NTr   )r3   r   asciiLettersBytesr   r   r   spacesAngleBracketsr   )r"   endTagr3   r   r   r   r   r   r     s    



z EncodingParser.handlePossibleTagc             C   s   | j dS )Nr   )r3   r   )r"   r   r   r   r     s    zEncodingParser.handleOtherc             C   s   | j }|ttdgB }|dks2t|dks2t|dkr>dS g }g }xt|dkrX|rXP nX|tkrl| }P nD|dkrd|dfS |tkr||	  n|dkrdS || t
|}qHW |dkr|  d|dfS t
| | }|dkrR|}xt
|}||kr(t
| d|d|fS |tkrB||	  q|| qW nJ|d	krjd|dfS |tkr||	  n|dkrdS || x^t
|}|tkrd|d|fS |tkr||	  n|dkrdS || qW dS )
z_Return a name,value pair for the next attribute in the stream,
        if one is found, or None   /Nr
   )r   N   =)r   r   r4   )   '   "r   )r3   r   r   	frozensetr$   r)   r5   asciiUppercaseBytesr2   r   r   r   r   )r"   r3   r   attrName	attrValue	quoteCharr   r   r   r     sh    










zEncodingParser.getAttributeN)r<   r=   r>   r?   r#   r   r   r   r   r   r   r   r   r   r   r   r   r     s   $r   c               @   s   e Zd Zdd Zdd ZdS )r   c             C   s   t |tst|| _d S )N)rC   r/   r)   r3   )r"   r3   r   r   r   r#   f  s    zContentAttrParser.__init__c             C   s  y| j d | j  jd7  _| j   | j jdks8d S | j  jd7  _| j   | j jdkr| j j}| j  jd7  _| j j}| j |r| j || j j S d S nF| j j}y| j t | j || j j S  tk
r   | j |d  S X W n tk
r    d S X d S )Ns   charsetr
   r   )r   r   )r3   r   r!   r   r   r   r   r   )r"   	quoteMarkoldPositionr   r   r   r   j  s.    

zContentAttrParser.parseN)r<   r=   r>   r#   r   r   r   r   r   r   e  s   r   c             C   s`   t | tr.y| d} W n tk
r,   dS X | dk	rXy
t| S  tk
rT   dS X ndS dS )z{Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.r   N)rC   r   decodeUnicodeDecodeErrorr	   lookupAttributeError)r   r   r   r   rW     s    

rW   )4
__future__r   r   r   pip._vendor.sixr   r   Zpip._vendor.six.movesr   r   r   r   pip._vendorr	   	constantsr   r   r   r   r   r\   r   ior   r   r   r   r   r   r   r   invalid_unicode_no_surrogaterR   rd   r)   r   evalrv   setr   ascii_punctuation_rer   objectr   rP   rJ   rK   r/   r   r   r   rW   r   r   r   r   <module>   sX   
"








J g Ih 6'