1 # -*- coding: utf-8 -*-
3 # htmlparser.py - An error tolerant HTML parser.
4 # Copyright (C) 2004,2005 Frédéric Jolliton <frederic@jolliton.com>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 # This module can replace the HTMLParser module of the Python standard library.
23 # It never throws an exception, even for the worst HTML document.
24 # However, it is not able to parse SGML declaration statements
25 # with complex syntax.
28 __all__ = [ 'HTMLParser' , 'HTMLParseError' ]
31 from htmlentitydefs import entitydefs
33 from misc import shortenText
# Compiled patterns shared by the parser below.
# NOTE(review): several lines of these multi-line patterns (closing
# parentheses, some alternatives, flags) are not present in this listing;
# the comments describe only the fragments that are.

# Finds the next '&' or '<'; __process uses it to locate the end of a run of data.
35 reEndOfData = re.compile(
36 '[&<]' # Either & or <
# Pattern body not shown; __process applies it to a candidate tag from offset 1.
38 reCheckTag = re.compile(
# Tag name at the start of the string: a letter, then letters, digits, '-' or ':'.
41 reTagName = re.compile(
42 '^[a-z][-:a-z0-9]*' , re.I
# __process applies this just after a '<' to find where the tag ends.
44 reEndOfTag = re.compile( # <CHAR*> (<QUOTED> <CHAR*>)* '>'
48 r"'[^']*'" # single-quote string
50 r'"[^"]*"' # double-quote string
# NOTE(review): the opening line of the next pattern is missing; presumably it
# is reAttr, which _parseAttr uses to split "name", "=" and value - confirm.
57 '([a-z_][a-z0-9._-]*(?::[a-z_][a-z0-9._-]*)?)' # name, optionally prefixed as "prefix:name"
59 r'\s*=\s*' # spaces then =
61 '[^\'"\\s=]+' # anything but spaces and quote characters
63 '"[^"]*"' # double-quote string
65 "'[^']*'" # single-quote string
# Character / entity reference; group 1 is the numeric form, group 2 the named form.
69 reEntity = re.compile(
72 r'#(\d+|x[\da-f]+)' # numeric entity, decimal or hexadecimal (e.g. &#160; or &#xA0;)
74 '([a-z]+)' # named entity (e.g. &nbsp;)
# Not used. Only for compatibility with the HTMLParser Python module, which
# exports an exception of the same name. It derives from Exception so that
# existing "except HTMLParseError" clauses (and any "raise") written against
# the standard module remain valid.
class HTMLParseError( Exception ) :
    """Compatibility placeholder for HTMLParser.HTMLParseError."""
    pass
def _decodeAttr( s ) :
    r"""
    Return `s` with character and entity references replaced by their text.

    Numeric references are decoded with unichr(), in decimal ("&#65;") as
    well as hexadecimal ("&#x41;") form. Named references are looked up,
    lower-cased, in the module-level `defaultEntities` table. A reference
    that cannot be decoded (unknown name, malformed number, code point out
    of range) is copied to the result unchanged, so this function does not
    raise on bad input. When `s` contains no reference at all it is
    returned as is.

    The example assumes `defaultEntities` maps 'apos' and 'amp'.

    >>> _decodeAttr( 'bar&apos;s &amp; baz &#256;' )
    u"bar's & baz \u0100"
    """

    r = reEntity.search( s )
    if r is None :
        return s

    result = []
    p = 0
    while r is not None :
        # Text between the previous reference and this one.
        result.append( s[ p : r.start( 0 ) ] )
        p = r.end( 0 )
        ref = r.group( 1 )
        if ref is not None :
            # Numeric reference. The pattern yields either digits or
            # "x" + hex digits; int() needs the base spelled out and the
            # "x" removed, otherwise it raises ValueError on hex input.
            try :
                if ref[ 0 ] in 'xX' :
                    code = int( ref[ 1 : ] , 16 )
                else :
                    code = int( ref , 10 )
                result.append( unichr( code ) )
            except ( ValueError , OverflowError ) :
                # unichr() rejects out-of-range values with ValueError and
                # huge ones with OverflowError: keep the reference verbatim.
                result.append( r.group( 0 ) )
        else :
            e = defaultEntities.get( r.group( 2 ).lower() )
            if e is None :
                # Unknown entity name: keep the reference verbatim.
                result.append( r.group( 0 ) )
            else :
                result.append( e )
        r = reEntity.search( s , p )
    # Trailing text after the last reference.
    result.append( s[ p : ] )
    return ''.join( result )
# Parse the attribute part of a tag into a list of ( name , value ) pairs.
# As the example shows, an attribute given without a value is reported with
# None, and the quotes around a quoted value are not part of the result.
# NOTE(review): the loop header, the docstring delimiters and the statements
# between the lines below are missing from this listing.
121 def _parseAttr( s ) :
124 >>> _parseAttr( 'foo bar=baz x="y" z=\'quux\'' )
125 [('foo', None), ('bar', 'baz'), ('x', 'y'), ('z', 'quux')]
131 r = reAttr.search( s , p ) # next attribute at or after position p
134 k , eq , v = r.groups() # presumably: name, "= value" part, raw value - confirm against reAttr
136 if v[ 0 ] == "'" or v[ 0 ] == '"' : # the raw value starts with a quote character
141 attrs.append( ( k , v ) )
# Event-driven HTML parser. Text given to feed() is appended to an internal
# buffer; __process() scans that buffer and calls the handle_*() methods,
# which do nothing here and are meant to be overridden by subclasses.
# NOTE(review): this listing is missing lines inside several methods (branch
# headers, position updates, loop statements); the comments below describe
# only the statements that are present.
145 class HTMLParser( object ) :
147 __slots__ = [ '__buffer' , '__pos' , '__cdataTags' , '__waitTag' ] # double-underscore names, hence name-mangled per class
149 def __init__( self ) :
154 # Tags with CDATA type
156 self.__cdataTags = set( ( 'script' , 'style' ) )
157 self.__waitTag = None # set by __processTag to the name of a just-opened CDATA-type tag
# Add the text `s` to the buffer.
159 def feed( self , s ) :
161 self.__buffer = self.__buffer[ self.__pos : ] + s # drop everything before __pos, then append the new text
167 self.__process( finalize = True ) # finalize disables the "reference cut at end of buffer" tests in __process
# Callbacks. All of them do nothing by default.
169 def handle_data( self , data ) : pass
170 def handle_starttag( self , name , attr ) : pass
171 def handle_endtag( self , name ) : pass
172 def handle_charref( self , name ) : pass
173 def handle_entityref( self , name ) : pass
174 def handle_comment( self , data ) : pass
175 def handle_decl( self , data ) : pass
176 def handle_pi( self , data ) : pass
# Default for an empty-element tag: report a start tag followed by an end tag.
177 def handle_startendtag( self , name , attr ) :
179 self.handle_starttag( name , attr )
180 self.handle_endtag( name )
# Scan the buffer from the saved position and dispatch on what is found there:
# '<?' , '<![CDATA[' , '<!--' , any other '<!' , any other tag, an '&' reference
# or plain data. The buffer and position are stored back at the end.
186 def __process( self , finalize = False ) :
189 # 1-letter variable used here:
192 # p = current position
197 b , p = self.__buffer , self.__pos
199 e = b.find( '</' , p ) # next '</' at or after p
201 self.handle_data( b[ p : e ] ) # text up to that '</' is reported as data
203 self.__waitTag = None
206 #print '%4d' % p , shortenText( b[ p : ] , 30 )
208 if b.startswith( '<?' , p ) :
209 e = b.find( '?>' , p + 2 ) # search for the closing '?>' after the 2-character opener
216 elif not b.startswith( '<!' , p ) :
217 r = reEndOfTag.match( b , p + 1 ) # try to find the end of the tag, starting just after '<'
222 rn = reCheckTag.match( tag , 1 ) # check the text that follows '<' in the candidate tag
224 self.handle_data( b[ p ] ) # report the single character at p as data
227 self.__processTag( tag )
230 e = b.find( '</' , p ) # same three steps as at the top of the method
232 self.handle_data( b[ p : e ] )
234 self.__waitTag = None
237 elif b.startswith( '<![CDATA[' , p ) :
238 e = b.find( ']]>' , p + 9 ) # 9 == len( '<![CDATA[' )
242 cdata = b[ p + 9 : e ] # section contents without the delimiters
244 self.handle_data( cdata )
245 elif b.startswith( '<!--' , p ) :
246 e , s = b.find( '-->' , p + 4 ) , 3 # s is the length of the terminator searched for
248 e , s = b.find( '->' , p + 4 ) , 2 # shorter terminator
250 e , s = b.find( '>' , p + 4 ) , 1 # shortest terminator
251 if e == -1 : # Unterminated comment
254 comment = b[ p + 4 : e ] # comment text without '<!--' and the terminator
256 self.handle_comment( comment )
257 else : # b.startswith( '<!' )
259 e = b.find( '>' , p + 2 ) # We only handle "simple" declaration.
263 self.handle_decl( b[ p + 2 : e ] ) # declaration text without '<!' and '>'
266 r = reEntity.match( b , p ) # is there a character / entity reference at p ?
268 if len( b ) - p > 3 :
269 self.handle_data( '&' ) # report a lone '&' as data
274 if r.group( 1 ) is not None : # group 1 is the numeric form of reEntity
276 if not finalize and not ref.endswith( ';' ) and r.end( 0 ) == len( b ) : # no ';' and the match stops exactly at the end of the buffer
278 self.handle_charref( ref )
281 if not finalize and not ref.endswith( ';' ) and r.end( 0 ) == len( b ) : # same test for the named form
283 self.handle_entityref( ref )
286 r = reEndOfData.search( b , p ) # next '&' or '<' at or after p
289 break # wait for end of data
296 self.handle_data( data )
299 self.handle_data( b[ p : ] ) # report everything from p to the end of the buffer
301 self.__buffer , self.__pos = b , p # save the state for the next call
# Classify one complete tag string (including its '<' and '>') and call the
# matching handle_*() method.
303 def __processTag( self , tag ) :
305 if tag.startswith( '<!' ) :
306 self.handle_decl( tag[ 2 : -1 ] ) # text between '<!' and '>'
308 tagContents = tag[ 1 : -1 ] # text between '<' and '>'
309 tagType = 0 # 0: start, 1: end, 2: empty
310 if tagContents.startswith( '/' ) :
312 tagContents = tagContents[ 1 : ] # remove the leading '/'
313 elif tagContents.endswith( '/' ) : # and ' ' not in tagContents :
315 tagContents = tagContents[ : -1 ] # remove the trailing '/'
316 r = reTagName.match( tagContents )
319 name , attr = tagContents[ : e ] , tagContents[ e : ] # split at offset e into tag name and attribute text
320 attr = _parseAttr( attr ) # attribute text -> list of ( name , value ) pairs
322 self.handle_starttag( name , attr )
324 if name in self.__cdataTags :
325 self.__waitTag = name # Start of CDATA element
327 self.handle_endtag( name )
329 self.handle_startendtag( name , attr )
333 self.handle_data( tag ) # NOTE(review): presumably the fallback when no tag name matched - the branch header is missing
class HTMLParserDebug( HTMLParser ) :
    """
    HTMLParser subclass that prints every event to standard output.

    Each handler writes one line of the form "event(arguments)", with the
    arguments shown through %r. The calls use the parenthesised,
    single-argument form of print, which is accepted both as a Python 2
    print statement and as the Python 3 print function. Single values are
    wrapped in a 1-tuple so that a tuple argument cannot be mistaken for
    a list of format arguments.
    """

    def handle_data( self , data ) :
        """Print the text run `data`."""
        print( 'data(%r)' % ( data , ) )

    def handle_starttag( self , name , attr ) :
        """Print the start tag `name` and its attribute list `attr`."""
        print( 'starttag(%r,%r)' % ( name , attr ) )

    def handle_endtag( self , name ) :
        """Print the end tag `name`."""
        print( 'endtag(%r)' % ( name , ) )

    def handle_startendtag( self , name , attr ) :
        """Print the empty-element tag, then delegate to the base class,
        which reports it again as a start tag followed by an end tag."""
        print( 'startendtag(%r,%r)...' % ( name , attr ) )
        HTMLParser.handle_startendtag( self , name , attr )

    def handle_charref( self , name ) :
        """Print the numeric character reference `name`."""
        print( 'charref(%r)' % ( name , ) )

    def handle_entityref( self , name ) :
        """Print the named entity reference `name`."""
        print( 'entityref(%r)' % ( name , ) )

    def handle_comment( self , data ) :
        """Print the comment text `data`."""
        print( 'comment(%r)' % ( data , ) )

    def handle_decl( self , data ) :
        """Print the declaration text `data`."""
        print( 'decl(%r)' % ( data , ) )

    def handle_pi( self , data ) :
        """Print the processing-instruction text `data`."""
        print( 'pi(%r)' % ( data , ) )