3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it. The BeautifulSoup class has heuristics for obtaining a sensible parse tree in the face of common HTML errors.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported by stock Python.
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
44 from __future__ import generators
46 __author__ = "Leonard Richardson (leonardr@segfault.org)"
48 __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
51 from sgmllib import SGMLParser, SGMLParseError
57 from htmlentitydefs import name2codepoint
61 #This hack makes Beautiful Soup able to parse XML with namespaces
62 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
64 DEFAULT_OUTPUT_ENCODING = "utf-8"
66 # First, the classes that represent markup elements.
69 """Contains the navigational information for some part of the page
70 (either a tag or a piece of text)"""
72 def setup(self, parent=None, previous=None):
73 """Sets up the initial relations between this element and
76 self.previous = previous
78 self.previousSibling = None
79 self.nextSibling = None
80 if self.parent and self.parent.contents:
81 self.previousSibling = self.parent.contents[-1]
82 self.previousSibling.nextSibling = self
84 def replaceWith(self, replaceWith):
85 oldParent = self.parent
86 myIndex = self.parent.contents.index(self)
87 if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
88 # We're replacing this element with one of its siblings.
89 index = self.parent.contents.index(replaceWith)
90 if index and index < myIndex:
91 # Furthermore, it comes before this element. That
92 # means that when we extract it, the index of this
93 # element will change.
96 oldParent.insert(myIndex, replaceWith)
99 """Destructively rips this element out of the tree."""
102 self.parent.contents.remove(self)
106 #Find the two elements that would be next to each other if
107 #this element (and any children) hadn't been parsed. Connect
109 lastChild = self._lastRecursiveChild()
110 nextElement = lastChild.next
113 self.previous.next = nextElement
115 nextElement.previous = self.previous
117 lastChild.next = None
120 if self.previousSibling:
121 self.previousSibling.nextSibling = self.nextSibling
123 self.nextSibling.previousSibling = self.previousSibling
124 self.previousSibling = self.nextSibling = None
126 def _lastRecursiveChild(self):
127 "Finds the last element beneath this object to be parsed."
129 while hasattr(lastChild, 'contents') and lastChild.contents:
130 lastChild = lastChild.contents[-1]
133 def insert(self, position, newChild):
134 if (isinstance(newChild, basestring)
135 or isinstance(newChild, unicode)) \
136 and not isinstance(newChild, NavigableString):
137 newChild = NavigableString(newChild)
139 position = min(position, len(self.contents))
140 if hasattr(newChild, 'parent') and newChild.parent != None:
141 # We're 'inserting' an element that's already one
142 # of this object's children.
143 if newChild.parent == self:
144 index = self.find(newChild)
145 if index and index < position:
146 # Furthermore we're moving it further down the
147 # list of this object's children. That means that
148 # when we extract this element, our target index
149 # will jump down one.
150 position = position - 1
153 newChild.parent = self
156 newChild.previousSibling = None
157 newChild.previous = self
159 previousChild = self.contents[position-1]
160 newChild.previousSibling = previousChild
161 newChild.previousSibling.nextSibling = newChild
162 newChild.previous = previousChild._lastRecursiveChild()
163 if newChild.previous:
164 newChild.previous.next = newChild
166 newChildsLastElement = newChild._lastRecursiveChild()
168 if position >= len(self.contents):
169 newChild.nextSibling = None
172 parentsNextSibling = None
173 while not parentsNextSibling:
174 parentsNextSibling = parent.nextSibling
175 parent = parent.parent
176 if not parent: # This is the last element in the document.
178 if parentsNextSibling:
179 newChildsLastElement.next = parentsNextSibling
181 newChildsLastElement.next = None
183 nextChild = self.contents[position]
184 newChild.nextSibling = nextChild
185 if newChild.nextSibling:
186 newChild.nextSibling.previousSibling = newChild
187 newChildsLastElement.next = nextChild
189 if newChildsLastElement.next:
190 newChildsLastElement.next.previous = newChildsLastElement
191 self.contents.insert(position, newChild)
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears after this Tag in the document.

    The search is delegated to _findOne, which is handed findAllNext
    as the method that collects candidates."""
    collector = self.findAllNext
    firstMatch = self._findOne(collector, name, attrs, text, **kwargs)
    return firstMatch
def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                **kwargs):
    """Returns all items that match the given criteria and appear
    after this Tag in the document.

    name, attrs and text describe what to match, limit caps the
    number of results, and any extra keyword arguments are passed
    on to _findAll together with nextGenerator."""
    # The keyword arguments must be forwarded: findNext() routes
    # through this method, and dropping them here would silently
    # discard attribute filters such as findNext(id="x").
    return self._findAll(name, attrs, text, limit, self.nextGenerator,
                         **kwargs)
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the closest sibling to this Tag that matches the
    given criteria and appears after this Tag in the document.

    Delegates to _findOne, using findNextSiblings to gather the
    candidates."""
    collector = self.findNextSiblings
    return self._findOne(collector, name, attrs, text, **kwargs)
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                     **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear after this Tag in the document.

    The matching itself is done by _findAll, walking the elements
    produced by nextSiblingGenerator."""
    walker = self.nextSiblingGenerator
    return self._findAll(name, attrs, text, limit, walker, **kwargs)
fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears before this Tag in the document.

    Delegates to _findOne, using findAllPrevious to gather the
    candidates."""
    collector = self.findAllPrevious
    firstMatch = self._findOne(collector, name, attrs, text, **kwargs)
    return firstMatch
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
    """Returns all items that match the given criteria and appear
    before this Tag in the document.

    The matching itself is done by _findAll, walking the elements
    produced by previousGenerator."""
    walker = self.previousGenerator
    return self._findAll(name, attrs, text, limit, walker, **kwargs)
fetchPrevious = findAllPrevious # Compatibility with pre-3.x
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the closest sibling to this Tag that matches the
    given criteria and appears before this Tag in the document.

    Delegates to _findOne, using findPreviousSiblings to gather the
    candidates."""
    collector = self.findPreviousSiblings
    return self._findOne(collector, name, attrs, text, **kwargs)
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear before this Tag in the document.

    The matching itself is done by _findAll, walking the elements
    produced by previousSiblingGenerator."""
    walker = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, walker, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
def findParent(self, name=None, attrs={}, **kwargs):
    """Returns the closest parent of this Tag that matches the given
    criteria, or None when findParents reports no match.

    name and attrs describe what to match; extra keyword arguments
    are forwarded to findParents so that attribute filters given as
    keywords take part in the search."""
    # NOTE: We can't use _findOne because findParents takes a different
    # set of arguments (it has no 'text' parameter).
    matches = self.findParents(name, attrs, 1, **kwargs)
    if matches:
        return matches[0]
    return None
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
    """Returns the parents of this Tag that match the given
    criteria.

    No text criterion applies here, so None is passed to _findAll in
    its place; candidates come from parentGenerator."""
    walker = self.parentGenerator
    return self._findAll(name, attrs, None, limit, walker, **kwargs)
fetchParents = findParents # Compatibility with pre-3.x
264 #These methods do the real heavy lifting.
266 def _findOne(self, method, name, attrs, text, **kwargs):
268 l = method(name, attrs, text, 1, **kwargs)
273 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
274 "Iterates over a generator looking for things that match."
276 if isinstance(name, SoupStrainer):
279 # Build a SoupStrainer
280 strainer = SoupStrainer(name, attrs, text, **kwargs)
281 results = ResultSet(strainer)
286 except StopIteration:
289 found = strainer.search(i)
291 results.append(found)
292 if limit and len(results) >= limit:
296 #These Generators can be used to navigate starting from both
297 #NavigableStrings and Tags.
298 def nextGenerator(self):
304 def nextSiblingGenerator(self):
310 def previousGenerator(self):
316 def previousSiblingGenerator(self):
319 i = i.previousSibling
322 def parentGenerator(self):
def substituteEncoding(self, str, encoding=None):
    """Replaces every %SOUP-ENCODING% placeholder in the given string
    with the name of the encoding. A missing or empty encoding is
    treated as "utf-8"."""
    if not encoding:
        encoding = "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)
333 def toEncoding(self, s, encoding=None):
334 """Encodes an object to a string in some encoding, or to Unicode.
336 if isinstance(s, unicode):
338 s = s.encode(encoding)
339 elif isinstance(s, str):
341 s = s.encode(encoding)
346 s = self.toEncoding(str(s), encoding)
351 class NavigableString(unicode, PageElement):
353 def __getattr__(self, attr):
354 """text.string gives you text. This is for backwards
355 compatibility for Navigable*String, but for CData* it lets you
356 get the string without the CData wrapper."""
360 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
362 def __unicode__(self):
363 return self.__str__(None)
365 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
367 return self.encode(encoding)
class CData(NavigableString):
    """A NavigableString that renders itself wrapped in a CDATA
    section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the bare text first, then add the CDATA wrapper.
        rendered = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % rendered
376 class ProcessingInstruction(NavigableString):
377 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
379 if "%SOUP-ENCODING%" in output:
380 output = self.substituteEncoding(output, encoding)
381 return "<?%s?>" % self.toEncoding(output, encoding)
class Comment(NavigableString):
    """A NavigableString that renders itself wrapped in comment
    delimiters."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the bare text first, then add the comment markers.
        rendered = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % rendered
class Declaration(NavigableString):
    """A NavigableString that renders itself wrapped in declaration
    delimiters."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the bare text first, then add the <! ... > wrapper.
        rendered = NavigableString.__str__(self, encoding)
        return "<!%s>" % rendered
391 class Tag(PageElement):
393 """Represents a found HTML tag with its attributes and contents."""
395 XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
401 def __init__(self, parser, name, attrs=None, parent=None,
405 # We don't actually store the parser object: that lets extracted
406 # chunks be garbage-collected
407 self.parserClass = parser.__class__
408 self.isSelfClosing = parser.isSelfClosingTag(name)
414 self.setup(parent, previous)
416 self.containsSubstitutions = False
def get(self, key, default=None):
    """Returns the value of the 'key' attribute for the tag, or
    the value given for 'default' if it doesn't have that
    attribute."""
    attributeMap = self._getAttrMap()
    return attributeMap.get(key, default)
def has_key(self, key):
    """Returns true iff the tag has an attribute named 'key'."""
    # The 'in' operator replaces the deprecated dict.has_key().
    return key in self._getAttrMap()
427 def __getitem__(self, key):
428 """tag[key] returns the value of the 'key' attribute for the tag,
429 and throws an exception if it's not there."""
430 return self._getAttrMap()[key]
433 "Iterating over a tag iterates over its contents."
434 return iter(self.contents)
437 "The length of a tag is the length of its list of contents."
438 return len(self.contents)
440 def __contains__(self, x):
441 return x in self.contents
443 def __nonzero__(self):
444 "A tag is non-None even if it has no contents."
447 def __setitem__(self, key, value):
448 """Setting tag[key] sets the value of the 'key' attribute for the
451 self.attrMap[key] = value
453 for i in range(0, len(self.attrs)):
454 if self.attrs[i][0] == key:
455 self.attrs[i] = (key, value)
458 self.attrs.append((key, value))
459 self._getAttrMap()[key] = value
461 def __delitem__(self, key):
462 "Deleting tag[key] deletes all 'key' attributes for the tag."
463 for item in self.attrs:
465 self.attrs.remove(item)
466 #We don't break because bad HTML can define the same
467 #attribute multiple times.
469 if self.attrMap.has_key(key):
470 del self.attrMap[key]
472 def __call__(self, *args, **kwargs):
473 """Calling a tag like a function is the same as calling its
474 findAll() method. Eg. tag('a') returns a list of all the A tags
475 found within this tag."""
476 return apply(self.findAll, args, kwargs)
478 def __getattr__(self, tag):
479 #print "Getattr %s.%s" % (self.__class__, tag)
480 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
481 return self.find(tag[:-3])
482 elif tag.find('__') != 0:
483 return self.find(tag)
485 def __eq__(self, other):
486 """Returns true iff this tag has the same name, the same attributes,
487 and the same contents (recursively) as the given tag.
489 NOTE: right now this will return false if two tags have the
490 same attributes in a different order. Should this be fixed?"""
491 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
493 for i in range(0, len(self.contents)):
494 if self.contents[i] != other.contents[i]:
498 def __ne__(self, other):
499 """Returns true iff this tag is not identical to the other tag,
500 as defined in __eq__."""
501 return not self == other
503 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
504 """Renders this tag as a string."""
505 return self.__str__(encoding)
507 def __unicode__(self):
508 return self.__str__(None)
510 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
511 prettyPrint=False, indentLevel=0):
512 """Returns a string or Unicode representation of this tag and
513 its contents. To get Unicode, pass None for encoding.
515 NOTE: since Python's HTML parser consumes whitespace, this
516 method is not certain to reproduce the whitespace present in
517 the original string."""
519 encodedName = self.toEncoding(self.name, encoding)
523 for key, val in self.attrs:
526 if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
527 val = self.substituteEncoding(val, encoding)
529 # The attribute value either:
531 # * Contains no embedded double quotes or single quotes.
532 # No problem: we enclose it in double quotes.
533 # * Contains embedded single quotes. No problem:
534 # double quotes work here too.
535 # * Contains embedded double quotes. No problem:
536 # we enclose it in single quotes.
537 # * Embeds both single _and_ double quotes. This
538 # can't happen naturally, but it can happen if
539 # you modify an attribute value after parsing
540 # the document. Now we have a bit of a
541 # problem. We solve it by enclosing the
542 # attribute in single quotes, and escaping any
543 # embedded single quotes to XML entities.
546 # This can't happen naturally, but it can happen
547 # if you modify an attribute value after parsing.
549 val = val.replace("'", "&squot;")
551 # Now we're okay w/r/t quotes. But the attribute
552 # value might also contain angle brackets, or
553 # ampersands that aren't part of entities. We need
554 # to escape those to XML entities too.
555 val = re.sub("([<>]|&(?![^\s]+;))",
556 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
559 attrs.append(fmt % (self.toEncoding(key, encoding),
560 self.toEncoding(val, encoding)))
563 if self.isSelfClosing:
566 closeTag = '</%s>' % encodedName
568 indentTag, indentContents = 0, 0
570 indentTag = indentLevel
571 space = (' ' * (indentTag-1))
572 indentContents = indentTag + 1
573 contents = self.renderContents(encoding, prettyPrint, indentContents)
580 attributeString = ' ' + ' '.join(attrs)
583 s.append('<%s%s%s>' % (encodedName, attributeString, close))
587 if prettyPrint and contents and contents[-1] != "\n":
589 if prettyPrint and closeTag:
592 if prettyPrint and closeTag and self.nextSibling:
597 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
598 return self.__str__(encoding, True)
600 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
601 prettyPrint=False, indentLevel=0):
602 """Renders the contents of this tag as a string in the given
603 encoding. If encoding is None, returns a Unicode string."""
607 if isinstance(c, NavigableString):
608 text = c.__str__(encoding)
609 elif isinstance(c, Tag):
610 s.append(c.__str__(encoding, prettyPrint, indentLevel))
611 if text and prettyPrint:
615 s.append(" " * (indentLevel-1))
623 def find(self, name=None, attrs={}, recursive=True, text=None,
625 """Return only the first child of this Tag matching the given
628 l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
def findAll(self, name=None, attrs={}, recursive=True, text=None,
            limit=None, **kwargs):
    """Extracts a list of Tag objects that match the given
    criteria.  You can specify the name of the Tag and any
    attributes you want the Tag to have.

    The value of a key-value pair in the 'attrs' map can be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the
    string matches for some custom definition of 'matches'. The
    same is true of the tag name.

    With recursive set, candidates come from recursiveChildGenerator;
    otherwise they come from childGenerator."""
    if recursive:
        generator = self.recursiveChildGenerator
    else:
        generator = self.childGenerator
    return self._findAll(name, attrs, text, limit, generator, **kwargs)
findChildren = findAll
651 # Pre-3.x compatibility methods
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x spelling of a text-only search: calls findAll with the
    given text, recursive and limit values as keyword arguments."""
    matches = self.findAll(text=text, recursive=recursive, limit=limit)
    return matches
def firstText(self, text=None, recursive=True):
    """Pre-3.x spelling of a text-only search for one item: calls
    find with the given text and recursive values as keyword
    arguments."""
    match = self.find(text=text, recursive=recursive)
    return match
def append(self, tag):
    """Appends the given tag to the contents of this tag.

    Only the contents list is touched here."""
    children = self.contents
    children.append(tag)
669 def _getAttrMap(self):
670 """Initializes a map representation of this tag's attributes,
671 if not already initialized."""
672 if not getattr(self, 'attrMap'):
674 for (key, value) in self.attrs:
675 self.attrMap[key] = value
679 def childGenerator(self):
680 for i in range(0, len(self.contents)):
681 yield self.contents[i]
684 def recursiveChildGenerator(self):
687 tag, start = stack.pop()
688 if isinstance(tag, Tag):
689 for i in range(start, len(tag.contents)):
692 if isinstance(a, Tag) and tag.contents:
693 if i < len(tag.contents) - 1:
694 stack.append((tag, i+1))
699 # Next, a couple classes to represent queries and their results.
701 """Encapsulates a number of ways of matching a markup element (tag or
704 def __init__(self, name=None, attrs={}, text=None, **kwargs):
707 kwargs['class'] = attrs
722 return "%s|%s" % (self.name, self.attrs)
724 def searchTag(self, markupName=None, markupAttrs={}):
727 if isinstance(markupName, Tag):
730 callFunctionWithTagData = callable(self.name) \
731 and not isinstance(markupName, Tag)
734 or callFunctionWithTagData \
735 or (markup and self._matches(markup, self.name)) \
736 or (not markup and self._matches(markupName, self.name)):
737 if callFunctionWithTagData:
738 match = self.name(markupName, markupAttrs)
742 for attr, matchAgainst in self.attrs.items():
743 if not markupAttrMap:
744 if hasattr(markupAttrs, 'get'):
745 markupAttrMap = markupAttrs
748 for k,v in markupAttrs:
750 attrValue = markupAttrMap.get(attr)
751 if not self._matches(attrValue, matchAgainst):
761 def search(self, markup):
762 #print 'looking for %s in %s' % (self, markup)
764 # If given a list of items, scan it for a text element that
766 if isList(markup) and not isinstance(markup, Tag):
767 for element in markup:
768 if isinstance(element, NavigableString) \
769 and self.search(element):
772 # If it's a Tag, make sure its name or attributes match.
773 # Don't bother with Tags if we're searching for text.
774 elif isinstance(markup, Tag):
776 found = self.searchTag(markup)
777 # If it's text, make sure the text matches.
778 elif isinstance(markup, NavigableString) or \
780 if self._matches(markup, self.text):
783 raise Exception, "I don't know how to match against a %s" \
787 def _matches(self, markup, matchAgainst):
788 #print "Matching %s against %s" % (markup, matchAgainst)
790 if matchAgainst == True and type(matchAgainst) == types.BooleanType:
791 result = markup != None
792 elif callable(matchAgainst):
793 result = matchAgainst(markup)
795 #Custom match methods take the tag as an argument, but all
796 #other ways of matching match the tag name as a string.
797 if isinstance(markup, Tag):
799 if markup and not isString(markup):
800 markup = unicode(markup)
801 #Now we know that chunk is either a string, or None.
802 if hasattr(matchAgainst, 'match'):
803 # It's a regexp object.
804 result = markup and matchAgainst.search(markup)
805 elif isList(matchAgainst):
806 result = markup in matchAgainst
807 elif hasattr(matchAgainst, 'items'):
808 result = markup.has_key(matchAgainst)
809 elif matchAgainst and isString(markup):
810 if isinstance(markup, unicode):
811 matchAgainst = unicode(matchAgainst)
813 matchAgainst = str(matchAgainst)
816 result = matchAgainst == markup
819 class ResultSet(list):
820 """A ResultSet is just a list that keeps track of the SoupStrainer
822 def __init__(self, source):
826 # Now, some helper functions.
829 """Convenience method that works with all 2.x versions of Python
830 to determine whether or not something is listlike."""
831 return hasattr(l, '__iter__') \
832 or (type(l) in (types.ListType, types.TupleType))
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        # This interpreter has no unicode/basestring builtins; str is
        # the only string type to check for.
        return isinstance(s, str)
842 def buildTagMap(default, *args):
843 """Turns a list of maps, lists, or scalars into a single map.
844 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
845 NESTING_RESET_TAGS maps out of lists and partial maps."""
848 if hasattr(portion, 'items'):
849 #It's a map. Merge it.
850 for k,v in portion.items():
852 elif isList(portion):
853 #It's a list. Map each item to the default.
857 #It's a scalar. Map it to the default.
858 built[portion] = default
861 # Now, the parser classes.
863 class BeautifulStoneSoup(Tag, SGMLParser):
865 """This class contains the basic parser and search code. It defines
866 a parser that knows nothing about tag behavior except for the
869 You can't close a tag without closing all the tags it encloses.
870 That is, "<foo><bar></foo>" actually means
871 "<foo><bar></bar></foo>".
873 [Another possible explanation is "<foo><bar /></foo>", but since
874 this class defines no SELF_CLOSING_TAGS, it will never use that
877 This class is useful for parsing XML or made-up markup languages,
878 or when BeautifulSoup makes an assumption counter to what you were
882 for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
883 XML_ENTITY_LIST[i] = True
885 SELF_CLOSING_TAGS = {}
887 RESET_NESTING_TAGS = {}
890 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
891 lambda x: x.group(1) + ' />'),
892 (re.compile('<!\s+([^<>]*)>'),
893 lambda x: '<!' + x.group(1) + '>')
896 ROOT_TAG_NAME = u'[document]'
898 HTML_ENTITIES = "html"
901 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
902 markupMassage=True, smartQuotesTo=XML_ENTITIES,
903 convertEntities=None, selfClosingTags=None):
904 """The Soup object is initialized as the 'root tag', and the
905 provided markup (which can be a string or a file-like object)
906 is fed into the underlying parser.
908 sgmllib will process most bad HTML, and the BeautifulSoup
909 class has some tricks for dealing with some HTML that kills
910 sgmllib, but Beautiful Soup can nonetheless choke or lose data
911 if your data uses self-closing tags or declarations
914 By default, Beautiful Soup uses regexes to sanitize input,
915 avoiding the vast majority of these problems. If the problems
916 don't apply to you, pass in False for markupMassage, and
917 you'll get better performance.
919 The default parser massage techniques fix the two most common
920 instances of invalid HTML that choke sgmllib:
922 <br/> (No space between name of closing tag and tag close)
923 <! --Comment--> (Extraneous whitespace in declaration)
925 You can pass in a custom list of (RE object, replace method)
926 tuples to get Beautiful Soup to scrub your input the way you
929 self.parseOnlyThese = parseOnlyThese
930 self.fromEncoding = fromEncoding
931 self.smartQuotesTo = smartQuotesTo
932 self.convertEntities = convertEntities
933 if self.convertEntities:
934 # It doesn't make sense to convert encoded characters to
935 # entities even while you're converting entities to Unicode.
936 # Just convert it all to Unicode.
937 self.smartQuotesTo = None
938 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
939 SGMLParser.__init__(self)
941 if hasattr(markup, 'read'): # It's a file-type object.
942 markup = markup.read()
944 self.markupMassage = markupMassage
949 self.markup = None # The markup can now be GCed
951 def _feed(self, inDocumentEncoding=None):
952 # Convert the document to Unicode.
954 if isinstance(markup, unicode):
955 if not hasattr(self, 'originalEncoding'):
956 self.originalEncoding = None
958 dammit = UnicodeDammit\
959 (markup, [self.fromEncoding, inDocumentEncoding],
960 smartQuotesTo=self.smartQuotesTo)
961 markup = dammit.unicode
962 self.originalEncoding = dammit.originalEncoding
964 if self.markupMassage:
965 if not isList(self.markupMassage):
966 self.markupMassage = self.MARKUP_MASSAGE
967 for fix, m in self.markupMassage:
968 markup = fix.sub(m, markup)
971 SGMLParser.feed(self, markup)
972 # Close out any unfinished strings and close all the open tags.
974 while self.currentTag.name != self.ROOT_TAG_NAME:
977 def __getattr__(self, methodName):
978 """This method routes method call requests to either the SGMLParser
979 superclass or the Tag superclass, depending on the method name."""
980 #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
982 if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
983 or methodName.find('do_') == 0:
984 return SGMLParser.__getattr__(self, methodName)
985 elif methodName.find('__') != 0:
986 return Tag.__getattr__(self, methodName)
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser.

    Both the class-level SELF_CLOSING_TAGS map and the per-instance
    instanceSelfClosingTags map are consulted."""
    # The 'in' operator replaces the deprecated dict.has_key().
    return name in self.SELF_CLOSING_TAGS \
           or name in self.instanceSelfClosingTags
997 Tag.__init__(self, self, self.ROOT_TAG_NAME)
999 SGMLParser.reset(self)
1000 self.currentData = []
1001 self.currentTag = None
1003 self.quoteStack = []
1007 tag = self.tagStack.pop()
1008 # Tags with just one string-owning child get the child as a
1009 # 'string' property, so that soup.tag.string is shorthand for
1010 # soup.tag.contents[0]
1011 if len(self.currentTag.contents) == 1 and \
1012 isinstance(self.currentTag.contents[0], NavigableString):
1013 self.currentTag.string = self.currentTag.contents[0]
1015 #print "Pop", tag.name
1017 self.currentTag = self.tagStack[-1]
1018 return self.currentTag
1020 def pushTag(self, tag):
1021 #print "Push", tag.name
1023 self.currentTag.append(tag)
1024 self.tagStack.append(tag)
1025 self.currentTag = self.tagStack[-1]
1027 def endData(self, containerClass=NavigableString):
1028 if self.currentData:
1029 currentData = ''.join(self.currentData)
1030 if not currentData.strip():
1031 if '\n' in currentData:
1035 self.currentData = []
1036 if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1037 (not self.parseOnlyThese.text or \
1038 not self.parseOnlyThese.search(currentData)):
1040 o = containerClass(currentData)
1041 o.setup(self.currentTag, self.previous)
1043 self.previous.next = o
1045 self.currentTag.contents.append(o)
1048 def _popToTag(self, name, inclusivePop=True):
1049 """Pops the tag stack up to and including the most recent
1050 instance of the given tag. If inclusivePop is false, pops the tag
1051 stack up to but *not* including the most recent instance of
1053 #print "Popping to %s" % name
1054 if name == self.ROOT_TAG_NAME:
1058 mostRecentTag = None
1059 for i in range(len(self.tagStack)-1, 0, -1):
1060 if name == self.tagStack[i].name:
1061 numPops = len(self.tagStack)-i
1063 if not inclusivePop:
1064 numPops = numPops - 1
1066 for i in range(0, numPops):
1067 mostRecentTag = self.popTag()
1068 return mostRecentTag
1070 def _smartPop(self, name):
1072 """We need to pop up to the previous tag of this type, unless
1073 one of this tag's nesting reset triggers comes between this
1074 tag and the previous tag of this type, OR unless this tag is a
1075 generic nesting trigger and another generic nesting trigger
1076 comes between this tag and the previous tag of this type.
1079 <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1080 <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1081 <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1082 <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1084 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1085 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1086 <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1089 nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1090 isNestable = nestingResetTriggers != None
1091 isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1094 for i in range(len(self.tagStack)-1, 0, -1):
1095 p = self.tagStack[i]
1096 if (not p or p.name == name) and not isNestable:
1097 #Non-nestable tags get popped to the top or to their
1101 if (nestingResetTriggers != None
1102 and p.name in nestingResetTriggers) \
1103 or (nestingResetTriggers == None and isResetNesting
1104 and self.RESET_NESTING_TAGS.has_key(p.name)):
1106 #If we encounter one of the nesting reset triggers
1107 #peculiar to this tag, or we encounter another tag
1108 #that causes nesting to reset, pop up to but not
1109 #including that tag.
1115 self._popToTag(popTo, inclusive)
1117 def unknown_starttag(self, name, attrs, selfClosing=0):
1118 #print "Start tag %s: %s" % (name, attrs)
1120 #This is not a real tag.
1121 #print "<%s> is not real!" % name
1122 attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1123 self.handle_data('<%s%s>' % (name, attrs))
1127 if not self.isSelfClosingTag(name) and not selfClosing:
1128 self._smartPop(name)
1130 if self.parseOnlyThese and len(self.tagStack) <= 1 \
1131 and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1134 tag = Tag(self, name, attrs, self.currentTag, self.previous)
1136 self.previous.next = tag
1139 if selfClosing or self.isSelfClosingTag(name):
1141 if name in self.QUOTE_TAGS:
1142 #print "Beginning quote (%s)" % name
1143 self.quoteStack.append(name)
1147 def unknown_endtag(self, name):
1148 #print "End tag %s" % name
1149 if self.quoteStack and self.quoteStack[-1] != name:
1150 #This is not a real end tag.
1151 #print "</%s> is not real!" % name
1152 self.handle_data('</%s>' % name)
1155 self._popToTag(name)
1156 if self.quoteStack and self.quoteStack[-1] == name:
1157 self.quoteStack.pop()
1158 self.literal = (len(self.quoteStack) > 0)
def handle_data(self, data):
    "Buffers a chunk of character data on the currentData list."
    pending = self.currentData
    pending.append(data)
1163 def _toStringSubclass(self, text, subclass):
1164 """Adds a certain piece of text to the tree as a NavigableString
1167 self.handle_data(text)
1168 self.endData(subclass)
def handle_pi(self, text):
    """Store a processing instruction as a ProcessingInstruction node.

    An instruction whose text starts with "xml" is replaced by a
    canonical XML declaration containing a %SOUP-ENCODING% slot, into
    which an encoding will be plugged later.
    """
    if text.startswith("xml"):
        text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
    """Store the text of a comment in the tree as a Comment object."""
    self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
    """Handle a numeric character reference as data.

    ref -- the text between "&#" and ";", e.g. "65" or "x41".

    When entity conversion is enabled (self.convertEntities is
    HTML_ENTITIES or XML_ENTITIES) the reference is replaced by the
    character it names. A reference that cannot be converted, or any
    reference when conversion is off, is passed through unchanged as
    "&#...;" so that malformed markup never aborts the parse.
    """
    literal = '&#%s;' % ref
    if self.convertEntities in [self.HTML_ENTITIES,
                                self.XML_ENTITIES]:
        try:
            # NOTE(review): whether hexadecimal references reach this
            # method depends on the parser's charref pattern; handling
            # them here is harmless if they never do.
            if ref[:1] in ('x', 'X'):
                codepoint = int(ref[1:], 16)
            else:
                codepoint = int(ref)
            data = unichr(codepoint)
        except (ValueError, OverflowError):
            # Not a number, or not a representable code point.
            data = literal
    else:
        data = literal
    self.handle_data(data)
def handle_entityref(self, ref):
    """Handle a named entity reference as data.

    The reference is converted to its Unicode character when
    self.convertEntities is HTML_ENTITIES, or when it is XML_ENTITIES
    and `ref` has a truthy entry in self.XML_ENTITY_LIST. Names
    missing from name2codepoint, and all references when conversion
    does not apply, are passed through as "&name;".
    """
    data = None
    shouldConvert = (self.convertEntities == self.HTML_ENTITIES
                     or (self.convertEntities == self.XML_ENTITIES
                         and self.XML_ENTITY_LIST.get(ref)))
    if shouldConvert:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            # Unknown entity name: fall through to the literal form.
            pass
    if not data:
        data = '&%s;' % ref
    self.handle_data(data)
def handle_decl(self, data):
    """Store a DOCTYPE or similar declaration as a Declaration object."""
    self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
    """Parse the declaration starting at self.rawdata[i].

    A CDATA section becomes a CData object; an unterminated one runs
    to the end of the data. Anything else is handed to SGMLParser; if
    that raises SGMLParseError, the rest of the input is treated as
    raw character data.

    Returns the index at which parsing should resume.
    """
    if self.rawdata[i:i+9] == '<![CDATA[':
        end = self.rawdata.find(']]>', i)
        if end == -1:
            end = len(self.rawdata)
        content = self.rawdata[i+9:end]
        resumeAt = end + 3
        self._toStringSubclass(content, CData)
        return resumeAt
    try:
        return SGMLParser.parse_declaration(self, i)
    except SGMLParseError:
        # Bogus declaration: keep everything that is left as text.
        remainder = self.rawdata[i:]
        self.handle_data(remainder)
        return i + len(remainder)
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Construct the parser, defaulting smartQuotesTo to
        HTML_ENTITIES unless the caller supplied a value."""
        kwargs.setdefault('smartQuotesTo', self.HTML_ENTITIES)
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])

    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta.
    # Group 1 is everything up to and including "charset=", group 3 is
    # the charset name itself.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning.

        attrs -- list of (name, value) pairs for the META tag. The
                 'content' entry is rewritten in place when the charset
                 is replaced by the %SOUP-ENCODING% placeholder.

        Raises StopParsing after re-feeding the document with a newly
        discovered charset.
        """
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                # getattr with a default: the attribute may not have
                # been set yet on the first pass.
                if getattr(self, 'declaredHTMLEncoding', None) or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    def rewrite(found):
                        return found.group(1) + "%SOUP-ENCODING%"
                    # Substitute in the content attribute itself, not in
                    # whichever attribute the loop above saw last.
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
class StopParsing(Exception):
    """Raised to abandon the current parse of a document."""
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags this class additionally treats as nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    # Block tags this class additionally treats as nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # BeautifulSoup's nestable tags plus the two lists above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # Only 'noscript' resets nesting, and no tag is considered nestable.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Pop the current tag, first copying a lone string child onto
        the parent as an attribute named after the tag, unless the
        parent already has an attribute of that name."""
        if len(self.tagStack) > 1:
            child = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            hasLoneString = (isinstance(child, Tag)
                             and len(child.contents) == 1
                             and isinstance(child.contents[0],
                                            NavigableString))
            if hasLoneString and not parent.attrMap.has_key(child.name):
                parent[child.name] = child.contents[0]
        BeautifulStoneSoup.popTag(self)
1461 #Enterprise class names! It has come to our attention that some people
1462 #think the names of the Beautiful Soup parser classes are too silly
1463 #and "unprofessional" for use in enterprise screen-scraping. We feel
1464 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1465 #All-Night Kosher Bakery recommends renaming this file to
1466 #"RobustParser.py" (or, in cases of extreme enterprisness,
1467 #"RobustParserBeanInterface.class") and using the following
1468 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1480 ######################################################
1482 # Bonus library: Unicode, Dammit
1484 # This class forces XML data into a standard format (usually to UTF-8
1485 # or Unicode). It is heavily based on code from Mark Pilgrim's
1486 # Universal Feed Parser. It does not rewrite the XML or HTML to
1487 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1488 # (XML) and BeautifulSoup.start_meta (HTML).
1490 # Autodetects character encodings.
1491 # Download from http://chardet.feedparser.org/
1494 # import chardet.constants
1495 # chardet.constants._debug = 1
1500 # cjkcodecs and iconv_codec make Python know about more character encodings.
1501 # Both are available from http://cjkpython.i18n.org/
1502 # They're built in if you use Python 2.4.
1504 import cjkcodecs.aliases
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    entities.

    After construction:
      unicode          -- the converted document, or None if every
                          candidate encoding failed.
      originalEncoding -- the encoding that worked, or None.
      triedEncodings   -- the codec names that were attempted.
    """

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    # Characters in the range windows-1252 uses for smart quotes and
    # similar punctuation; compiled once instead of on every conversion.
    _MS_CHARS_RE = re.compile("([\x80-\x9f])")

    # Captures the encoding named in a leading XML declaration.
    _XML_ENCODING_RE = re.compile(r"""^<\?.*encoding=['"](.*?)['"].*\?>""")

    def __init__(self, markup, overrideEncodings=None,
                 smartQuotesTo='xml'):
        """Convert `markup` to Unicode.

        markup            -- the document, as a byte string or unicode.
        overrideEncodings -- encodings to try before anything detected
                             in the document (None means none).
        smartQuotesTo     -- 'xml' for numeric entities, any other true
                             value for named HTML entities, or a false
                             value to leave smart quotes alone.
        """
        if overrideEncodings is None:
            overrideEncodings = []
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break
        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity.

        Returns the MS_CHARS entry unchanged when it is a plain
        string, or None when `orig` has no entry."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the encoding `proposed`.

        On success stores the result in self.markup, records the
        encoding in self.originalEncoding and returns the decoded
        text. Returns None if the encoding is unusable, was already
        tried, or fails to decode the document."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
            def replaceSmartQuote(found):
                return self._subMSChar(found.group(1))
            markup = self._MS_CHARS_RE.sub(replaceSmartQuote, markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # Deliberately best-effort: a failure just means this
            # encoding was the wrong guess.
            return None
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases

        A byte order mark at the start of `data` overrides `encoding`
        and is removed before decoding.'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.

        Returns (xml_data, xml_encoding, sniffed_xml_encoding): the
        document (transcoded when a multi-byte signature was
        recognised), the encoding named in its XML declaration or
        None, and the encoding guessed from its first bytes or None."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
            xml_encoding_match = self._XML_ENCODING_RE.match(xml_data)
        except Exception:
            # Sniffing is best-effort; any failure means "unknown".
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            # A declared multi-byte family name is less specific than
            # what the byte signature told us, so prefer the latter.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to a codec name Python knows.

        Tries the alias table, then the name with hyphens removed,
        then with hyphens turned into underscores; if none resolve,
        `charset` is returned as given."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return `charset` if codecs.lookup accepts it, else None.
        A false `charset` is returned unchanged."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            # Unknown or malformed codec name.
            pass
        return codec

    # Translation table built on first use and cached on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the EBCDIC byte string `s` using a 256-entry
        table that is built once and cached on the class."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Replacement for each character in \x80-\x9f: either a literal
    # string, or an (HTML entity name, hex code point) pair consumed
    # by _subMSChar.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # Yuml is U+0178; an empty code point would emit "&#x;".
                 '\x9f' : ('Yuml', '178'),}
1760 #######################################################################
1763 #By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Read an HTML document from standard input and pretty-print it.
    document = sys.stdin.read()
    print(BeautifulSoup(document).prettify())