1 # -*- coding: utf-8 -*-
3 # htmlparser.py - An error tolerant HTML parser.
4 # Copyright (C) 2004,2005 Frédéric Jolliton <frederic@jolliton.com>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 # This module can replace Python's standard HTMLParser module.
23 # It never throws an exception, even for the worst HTML documents.
24 # However, it is not able to parse SGML declaration statements
25 # with complex syntax.
31 # [ ] For incremental parsing, keep a pointer to the maximum position
32 # from which we should start again to match (for example, when
33 # looking for </script>, we search </. We can remember where the
34 # latest </ was found and start again from this point next time
35 # new characters are added.)
38 __all__ = [ 'HTMLParser' , 'HTMLParseError' ]
41 from htmlentitydefs import entitydefs
43 from misc import shortenText
45 reEndOfData = re.compile(
46 '[&<]' # Either & or <
48 reCheckTag = re.compile(
51 reTagName = re.compile(
52 '^[a-z_:][a-z0-9._:-]*' , re.I
54 reEndOfTag = re.compile( # <CHAR*> (<QUOTED> <CHAR*>)* '>'
58 r"'[^']*'" # single-quote string
60 r'"[^"]*"' # double-quote string
67 '([a-z_:][a-z0-9._:-]*)' # name
69 r'\s*=\s*' # spaces then =
71 '[^\'"\\s]+' # anything but spaces and quote characters
73 '"[^"]*"' # double-quote string
75 "'[^']*'" # single-quote string
79 reEntity = re.compile(
82 r'#(\d+|x[\da-f]+)' # numeric entity ( &#160; or &#xA0; )
84 '([a-z]+)' # named entity ( &nbsp; )
# Not used by this parser. Provided only for compatibility with the
# standard HTMLParser Python module.
class HTMLParseError( Exception ) :
    """Stand-in for the exception class exported by the standard HTMLParser module.

    It derives from Exception so that code written against the standard
    module (``except HTMLParseError`` or ``raise HTMLParseError( msg )``)
    keeps working when this module is used as a replacement.
    """
100 def _decodeAttr( s ) :
103 >>> _decodeAttr( 'bar&apos;s &amp; baz &#256;' )
104 u"bar's & baz \u0100"
107 r = reEntity.search( s )
113 while r is not None :
114 result.append( s[ p : r.start( 0 ) ] )
116 if r.group( 1 ) is not None :
118 result.append( unichr( int( r.group( 1 ) ) ) )
119 except OverflowError :
120 result.append( r.group( 0 ) )
122 e = defaultEntities.get( r.group( 2 ).lower() )
124 result.append( r.group( 0 ) )
127 r = reEntity.search( s , p )
128 result.append( s[ p : ] )
129 return ''.join( result )
131 def _parseAttr( s ) :
134 >>> _parseAttr( 'foo bar=baz x="y" z=\'quux\'' )
135 [('foo', None), ('bar', 'baz'), ('x', 'y'), ('z', 'quux')]
141 r = reAttr.search( s , p )
144 k , eq , v = r.groups()
146 if v[ 0 ] == "'" or v[ 0 ] == '"' :
151 attrs.append( ( k , v ) )
155 class HTMLParser( object ) :
157 __slots__ = [ '__buffer' , '__pos' , '__cdataTags' , '__waitTag' ]
159 def __init__( self ) :
165 # Tags with CDATA type
167 self.__cdataTags = set( ( 'script' , 'style' ) )
168 self.__waitTag = None
170 def feed( self , s ) :
172 self.__buffer = self.__buffer[ self.__pos : ] + s
178 self.__process( finalize = True )
180 def handle_data( self , data ) : pass
181 def handle_starttag( self , name , attr ) : pass
182 def handle_endtag( self , name ) : pass
183 def handle_charref( self , name ) : pass
184 def handle_entityref( self , name ) : pass
185 def handle_comment( self , data ) : pass
186 def handle_decl( self , data ) : pass
187 def handle_pi( self , data ) : pass
188 def handle_startendtag( self , name , attr ) :
190 self.handle_starttag( name , attr )
191 self.handle_endtag( name )
197 def __process( self , finalize = False ) :
200 # 1-letter variable used here:
203 # p = current position
208 b , p = self.__buffer , self.__pos
210 wt = self.__waitTag.lower()
213 e = b.find( '</' , e )
216 if b[ e + 2 : e + 2 + len( wt ) ].lower() == wt :
217 self.handle_data( b[ p : e ] )
219 self.__waitTag = None
224 #print '%4d' % p , shortenText( b[ p : ] , 30 )
226 if b.startswith( '<?' , p ) :
227 e = b.find( '?>' , p + 2 )
234 elif not b.startswith( '<!' , p ) :
235 r = reEndOfTag.match( b , p + 1 )
240 rn = reCheckTag.match( tag , 1 )
242 self.handle_data( b[ p ] )
245 self.__processTag( tag )
248 wt = self.__waitTag.lower()
251 e = b.find( '</' , e )
254 if b[ e + 2 : e + 2 + len( wt ) ].lower() == wt :
255 self.handle_data( b[ p : e ] )
257 self.__waitTag = None
262 elif b.startswith( '<![CDATA[' , p ) :
263 e = b.find( ']]>' , p + 9 )
267 cdata = b[ p + 9 : e ]
269 self.handle_data( cdata )
270 elif b.startswith( '<!--' , p ) :
271 e , s = b.find( '-->' , p + 4 ) , 3
273 e , s = b.find( '->' , p + 4 ) , 2
275 e , s = b.find( '>' , p + 4 ) , 1
276 if e == -1 : # Unterminated comment
279 comment = b[ p + 4 : e ]
281 self.handle_comment( comment )
282 else : # b.startswith( '<!' )
284 e = b.find( '>' , p + 2 ) # We only handle "simple" declaration.
288 self.handle_decl( b[ p + 2 : e ] )
291 r = reEntity.match( b , p )
293 if len( b ) - p > 3 :
294 self.handle_data( '&' )
299 if r.group( 1 ) is not None :
301 if not finalize and not ref.endswith( ';' ) and r.end( 0 ) == len( b ) :
303 self.handle_charref( ref )
306 if not finalize and not ref.endswith( ';' ) and r.end( 0 ) == len( b ) :
308 self.handle_entityref( ref )
311 r = reEndOfData.search( b , p )
314 break # wait for end of data
321 self.handle_data( data )
324 self.handle_data( b[ p : ] )
326 self.__buffer , self.__pos = b , p
328 def __processTag( self , tag ) :
330 if tag.startswith( '<!' ) :
331 self.handle_decl( tag[ 2 : -1 ] )
333 tagContents = tag[ 1 : -1 ]
334 tagType = 0 # 0: start, 1: end, 2: empty
335 if tagContents.startswith( '/' ) :
337 tagContents = tagContents[ 1 : ]
338 elif tagContents.endswith( '/' ) : # and ' ' not in tagContents :
340 tagContents = tagContents[ : -1 ]
341 r = reTagName.match( tagContents )
344 name , attr = tagContents[ : e ] , tagContents[ e : ]
345 attr = _parseAttr( attr )
347 self.handle_starttag( name , attr )
349 if name in self.__cdataTags :
350 self.__waitTag = name # Start of CDATA element
352 self.handle_endtag( name )
354 self.handle_startendtag( name , attr )
358 self.handle_data( tag )
class HTMLParserDebug( HTMLParser ) :
    """HTMLParser subclass whose handlers print each event they receive."""

    def handle_data( self , data ) :
        """Print a data event."""
        line = 'data(%r)' % data
        print( line )

    def handle_starttag( self , name , attr ) :
        """Print a start tag event with its name and attributes."""
        line = 'starttag(%r,%r)' % ( name , attr )
        print( line )

    def handle_endtag( self , name ) :
        """Print an end tag event."""
        line = 'endtag(%r)' % name
        print( line )

    def handle_startendtag( self , name , attr ) :
        """Print a start-end tag event, then delegate to the base handler."""
        line = 'startendtag(%r,%r)...' % ( name , attr )
        print( line )
        HTMLParser.handle_startendtag( self , name , attr )

    def handle_charref( self , name ) :
        """Print a character reference event."""
        line = 'charref(%r)' % name
        print( line )

    def handle_entityref( self , name ) :
        """Print an entity reference event."""
        line = 'entityref(%r)' % name
        print( line )

    def handle_comment( self , data ) :
        """Print a comment event."""
        line = 'comment(%r)' % data
        print( line )

    def handle_decl( self , data ) :
        """Print a declaration event."""
        line = 'decl(%r)' % data
        print( line )

    def handle_pi( self , data ) :
        """Print a processing instruction event."""
        line = 'pi(%r)' % data
        print( line )