Fixed <script>/<style> element parsing.
author Frederic Jolliton <frederic@jolliton.com>
Wed, 7 Sep 2005 16:27:51 +0000 (16:27 +0000)
committer Frederic Jolliton <frederic@jolliton.com>
Wed, 7 Sep 2005 16:27:51 +0000 (16:27 +0000)
 * Fixed HTML parser to wait for the correct ending tag of <script> and <style>
   elements (previously we stopped at the first "</" found).
git-archimport-id: frederic@jolliton.com--2005-main/tx--main--0.1--patch-6

htmlparser.py

index cb10ad7..1e4053c 100644 (file)
 # with complex syntax.
 #
 
+#
+# TODO:
+#
+# [ ] For incremental parsing, keep a pointer to the furthest position
+#     from which matching should resume (for example, when looking
+#     for </script>, we search for "</". We can remember where the
+#     latest "</" was found and start again from that point the next
+#     time new characters are added.)
+#
+
 __all__ = [ 'HTMLParser' , 'HTMLParseError' ]
 
 import re
@@ -196,11 +206,18 @@ class HTMLParser( object ) :
                #
                b , p = self.__buffer , self.__pos
                if self.__waitTag :
-                       e = b.find( '</' , p )
-                       if e != -1 :
-                               self.handle_data( b[ p : e ] )
-                               p = e
-                               self.__waitTag = None
+                       wt = self.__waitTag.lower()
+                       e = p
+                       while 1 :
+                               e = b.find( '</' , e )
+                               if e == -1 :
+                                       break
+                               if b[ e + 2 : e + 2 + len( wt ) ].lower() == wt :
+                                       self.handle_data( b[ p : e ] )
+                                       p = e
+                                       self.__waitTag = None
+                                       break
+                               e += 2
                else :
                        while p < len( b ) :
                                #print '%4d' % p , shortenText( b[ p : ] , 30 )
@@ -227,12 +244,19 @@ class HTMLParser( object ) :
                                                        self.__processTag( tag )
                                                        p = e
                                                        if self.__waitTag :
-                                                               e = b.find( '</' , p )
-                                                               if e != -1 :
-                                                                       self.handle_data( b[ p : e ] )
-                                                                       p = e
-                                                                       self.__waitTag = None
-                                                               else :
+                                                               wt = self.__waitTag.lower()
+                                                               e = p
+                                                               while 1 :
+                                                                       e = b.find( '</' , e )
+                                                                       if e == -1 :
+                                                                               break
+                                                                       if b[ e + 2 : e + 2 + len( wt ) ].lower() == wt :
+                                                                               self.handle_data( b[ p : e ] )
+                                                                               p = e
+                                                                               self.__waitTag = None
+                                                                               break
+                                                                       e += 2
+                                                               if e == -1 :
                                                                        break
                                        elif b.startswith( '<![CDATA[' , p ) :
                                                e = b.find( ']]>' , p + 9 )