# Exception
'NotReached' ,
# Functions
+ 'decodeData' ,
'iterSingle' ,
'identity' ,
'constantly' ,
]
import time
+import re
class NotReached( Exception ) :
characters to skip to get beginning of real data.'''
if isinstance( s , unicode ) :
- return 'UNICODE'
+ return 'UNICODE' , 0
else :
for encoding , name in encodingsBom :
if s[ : len( encoding ) ] == encoding :
return name , 0
return None , 0
# Regex fragments for the three pseudo-attributes of an XML declaration.
# Each one captures the value, quotes included, in a named group.
# Whitespace is accepted on both sides of '=' because the XML grammar
# allows it (Eq ::= S? '=' S?).
rtXmlDeclVersionInfo = r'''(?:\s+version\s*=\s*(?P<version>(?:'[^']*'|"[^"]*")))'''
rtXmlDeclEncodingDecl = r'''(?:\s+encoding\s*=\s*(?P<encoding>(?:'[^']*'|"[^"]*")))'''
rtXmlDeclStandaloneDecl = r'''(?:\s+standalone\s*=\s*(?P<standalone>(?:'[^']*'|"[^"]*")))'''

# Lenient on purpose: the pseudo-attributes may come in any order, any
# number of times, and the '?' before the closing '>' is optional.
reXmlDecl = re.compile( r'<\?xml(?:%s|%s|%s)*\s*\??>'
                        % ( rtXmlDeclVersionInfo ,
                            rtXmlDeclEncodingDecl ,
                            rtXmlDeclStandaloneDecl ) )

def getXmlEncoding( data ) :

    '''Return the encoding named by the XML declaration in 'data'.

    The first XML declaration found anywhere in 'data' is used. The
    result is the value of its 'encoding' pseudo-attribute, unquoted
    and lower-cased. None is returned when there is no declaration or
    when the declaration has no 'encoding' pseudo-attribute.'''

    declaration = reXmlDecl.search( data )
    if declaration is None :
        return None
    quoted = declaration.group( 'encoding' )
    if quoted is None :
        return None
    # Drop the surrounding quote characters captured by the group.
    return quoted[ 1 : -1 ].lower()
+
# Matches one <meta ...> start tag; group 1 is the raw attribute text.
reLookupMeta = re.compile( r'<meta\s+([^>]+)>' , re.I )
# Matches one complete name=value attribute. Whole attribute names are
# consumed so that 'content=' is never picked out of e.g. 'data-content=',
# and quoted values are consumed whole so that text inside them is never
# mistaken for an attribute. Values may be quoted or bare.
reAttributes = re.compile( r'''([^\s"'=<>/]+)\s*=\s*('[^']*'|"[^"]*"|[^\s"'>]+)''' )

def getHtmlEncoding( data ) :

    '''Return the character encoding declared by a META element in 'data'.

    Two declaration forms are recognised:
      <meta http-equiv="Content-Type" content="text/html; charset=NAME">
      <meta charset="NAME">
    The name is returned lower-cased. None is returned when no META
    element declares a charset. Scanning stops at the first
    Content-Type META element, whether or not it names a charset.'''

    for meta in reLookupMeta.finditer( data ) :
        attributes = {}
        for name , value in reAttributes.findall( meta.group( 1 ) ) :
            if value[ : 1 ] in ( '"' , "'" ) :
                value = value[ 1 : -1 ]
            attributes[ name.lower() ] = value.lower()
        # HTML5 short form.
        charset = attributes.get( 'charset' , '' ).strip()
        if charset :
            return charset
        if attributes.get( 'http-equiv' ) != 'content-type' :
            continue
        # Skip the media type and look through the ';'-separated parameters.
        for item in attributes.get( 'content' , '' ).split( ';' )[ 1 : ] :
            item = item.split( '=' )
            if len( item ) == 2 and item[ 0 ].strip() == 'charset' :
                return item[ 1 ].strip()
        break
    return None
+
# Table consulted by decodeData(): keys are encoding labels, values are
# Python codec names. A value of None marks a label that decodeData()
# rejects as unsupported.
encodingName = dict.fromkeys( [ 'UTF-32/BE' ,
                                'UTF-32/LE' ,
                                'UTF-32/2143' ,
                                'UTF-32/3412' ,
                                '32BIT/BE' ,
                                '32BIT/LE' ,
                                '32BIT/2143' ,
                                '32BIT/3412' ,
                                'EBCDIC' ] )
encodingName.update( { 'UTF-16/BE' : 'utf_16_be' ,
                       'UTF-16/LE' : 'utf_16_le' ,
                       'UTF-8'     : 'utf_8' } )
+
def decodeData( data ) :

    '''Decode XML/HTML text in 'data' and return a Unicode string.

    The encoding is guessed in this order: the label reported by
    guessXmlCharacterEncoding(), then the XML declaration, then a META
    element, and finally ISO-8859-1 as the default.

    Raises Error when the reported label has no codec in encodingName
    or is not a label this function knows about.'''

    defaultEncoding = 'iso-8859-1'
    encoding , skip = guessXmlCharacterEncoding( data )
    data = data[ skip : ]

    if encoding == 'UNICODE' :
        # Already Unicode, nothing to decode.
        return data

    if encoding in encodingName :
        name = encodingName[ encoding ]
        if name is None :
            raise Error( 'Unsupported encoding %s' % encoding )
        # Decode with the codec name from the table. The label itself
        # (e.g. 'UTF-16/BE') is not a codec name.
        return data.decode( name )

    if encoding == '16BIT/BE' :
        return decode16bitBe( data )
    if encoding == '16BIT/LE' :
        return decode16bitLe( data )

    if encoding in ( '8BIT' , 'UNKNOWN' ) :
        declared = getXmlEncoding( data )
        if declared is None :
            declared = getHtmlEncoding( data ) or defaultEncoding
        try :
            return data.decode( declared )
        except ( LookupError , UnicodeDecodeError ) :
            # The document names a codec Python does not know, or its
            # bytes do not match the codec it names. Fall back to the
            # default, which accepts any byte sequence.
            return data.decode( defaultEncoding )

    raise Error( 'Unexpected encoding %s' % encoding )
+
def iterSingle( o ) :
'''Iterate over singleton 'o'.'''
import htmltree
from xpath import XPath
import xpathparser
-from misc import guessXmlCharacterEncoding
from sequence import Sequence
from nodes import Node, Document
from error import Error
else :
print item
-rtXmlDeclVersionInfo = r'''(?:\s+version=(?P<version>(?:'[^']*'|"[^"]*")))'''
-rtXmlDeclEncodingDecl = r'''(?:\s+encoding=(?P<encoding>(?:'[^']*'|"[^"]*")))'''
-rtXmlDeclStandaloneDecl = r'''(?:\s+standalone=(?P<standalone>(?:'[^']*'|"[^"]*")))'''
-reXmlDecl = re.compile( r'<\?xml(?:%s|%s|%s)*\s*\??>' \
- % ( rtXmlDeclVersionInfo ,
- rtXmlDeclEncodingDecl ,
- rtXmlDeclStandaloneDecl ) )
-def decodeDocument( txt ) :
-
- if not isinstance( txt , unicode ) :
- enc , skip = guessXmlCharacterEncoding( txt[ : 4 ] )
- dec = 'utf_8'
- if enc == 'UTF-8' :
- dec = 'utf_8'
- elif enc == '8BIT' :
- r = reXmlDecl.search( txt )
- if r is not None :
- enc = r.groupdict()[ 'encoding' ]
- if enc is not None :
- enc = enc[ 1 : -1 ].lower()
- if enc.startswith( 'iso-8859-' ) :
- dec = enc
- elif enc in ( 'utf8' , 'utf-8' ) :
- dec = 'utf_8'
- else :
- dec = None
- else :
- dec = None
- if dec is None :
- dec = 'utf_8'
- txt = txt[ skip : ]
- try :
- txt = txt.decode( dec )
- except UnicodeDecodeError :
- # fallback to ISO-8859-1
- txt = txt.decode( 'iso-8859-1' )
- return txt
-
def readDoc( uri ) :

    '''Fetch the resource at 'uri' and return the result of
    htmltree.parse() on its raw content.'''

    # NOTE(review): the content is passed on undecoded; presumably
    # htmltree.parse() handles character decoding itself -- confirm.
    stream = urllib.urlopen( uri )
    try :
        txt = stream.read()
    finally :
        # Release the connection even when the read fails.
        stream.close()
    return htmltree.parse( txt )
def resetUserAgent( env ) :