From 8eeba026961719e5123195d354dacf75938dff62 Mon Sep 17 00:00:00 2001
From: Zhiming Wang <zmwangx@gmail.com>
Date: Sun, 5 Jul 2015 23:35:08 -0700
Subject: fixes for beautifulsoup4 4.4.0

* Explicitly specify the lxml parser to suppress warnings;

* Supposedly fix the

      AttributeError: 'NoneType' object has no attribute 'next_element'

  issue [1] with BeautifulSoup when finding elements after extracting
  certain tags. I bet I've encountered that error before, but it seemed
  to go away somehow. Not this time. Without digging into too much
  detail (don't have time to file a bug or anything), this fix --
  precompile a list of tags to extract, and only extract after done with
  other processing -- seems to work.

* Add .nojekyll to root of build
  dir (https://help.github.com/articles/files-that-start-with-an-underscore-are-missing/).

[1]: Full traceback:

    ---------------------------------------------------------------------------
    AttributeError                            Traceback (most recent call last)
    <ipython-input-7-9d3d20b34e0c> in <module>()
    ----> 1 a.footer

    /Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in __getattr__(self, tag)
        995         # We special case contents to avoid recursion.
        996         elif not tag.startswith("__") and not tag=="contents":
    --> 997             return self.find(tag)
        998         raise AttributeError(
        999             "'%s' object has no attribute '%s'" % (self.__class__, tag))

    /Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in find(self, name, attrs, recursive, text, **kwargs)
       1232         criteria."""
       1233         r = None
    -> 1234         l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
       1235         if l:
       1236             r = l[0]

    /Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in find_all(self, name, attrs, recursive, text, limit, **kwargs)
       1253         if not recursive:
       1254             generator = self.children
    -> 1255         return self._find_all(name, attrs, text, limit, generator, **kwargs)
       1256     findAll = find_all       # BS3
       1257     findChildren = find_all  # BS2

    /Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in _find_all(self, name, attrs, text, limit, generator, **kwargs)
        527         while True:
        528             try:
    --> 529                 i = next(generator)
        530             except StopIteration:
        531                 break

    /Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in descendants(self)
       1271         while current is not stopNode:
       1272             yield current
    -> 1273             current = current.next_element
       1274
       1275     # CSS selector code

    AttributeError: 'NoneType' object has no attribute 'next_element'
---
 pyblog | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'pyblog')
diff --git a/pyblog b/pyblog
index 8b500a03..8817d5e4 100755
--- a/pyblog
+++ b/pyblog
@@ -381,7 +381,7 @@ def generate_sitemap(feed):
         # try to extract updated time
         updated = None
         with open(fullpath, encoding="utf-8") as htmlobj:
-            soup = bs4.BeautifulSoup(htmlobj.read())
+            soup = bs4.BeautifulSoup(htmlobj.read(), "lxml")
             if soup.article.footer is not None:
                 updated_tag = soup.article.footer.find(attrs={"class": "updated"})
                 if updated_tag is not None:
@@ -480,7 +480,7 @@ def generate_index_and_feed():
             item = RssItem()
             try:
                 with open(htmlpath, encoding="utf-8") as htmlfile:
-                    soup = bs4.BeautifulSoup(htmlfile.read())
+                    soup = bs4.BeautifulSoup(htmlfile.read(), "lxml")
 
                     # generate atom entry
                     entry.author = copy.deepcopy(feed.author)  # assume it's always the same author
@@ -500,21 +500,25 @@ def generate_index_and_feed():
                     entry.updated.text = entry.updated_datetime.isoformat()
 
                     # process content
-                    # extract the article content without header and footer
+                    tags_to_remove = []
+                    # mark header and footer for removal
                     article = soup.article
                     if article.header is not None:
-                        article.header.extract()
+                        tags_to_remove.append(article.header)
                     if article.footer is not None:
-                        article.footer.extract()
-                    # remove line numbers
+                        tags_to_remove.append(article.footer)
+                    # mark line numbers for removal
                     for line_number_span in article.find_all("span",
                                                              attrs={"class": "line-number"}):
-                        line_number_span.extract()
-                    # remove script tags
+                        tags_to_remove.append(line_number_span)
+                    # mark script tags for removal
                     for script_tag in article.find_all("script"):
-                        script_tag.extract()
+                        tags_to_remove.append(script_tag)
                     # make internal links absolute
                     abosolutify_links(article, entry_url)
+                    # remove marked tags
+                    for tag in tags_to_remove:
+                        tag.extract()
 
                     entry.content_html = ''.join([str(content)
                                                   for content in article.contents])
@@ -540,7 +544,7 @@ def generate_index_and_feed():
                     item.assemble_item()
                     rss.items.append(item)
             except Exception:
-                sys.stderr.write("failed to generate feed entry from %s" % name)
+                sys.stderr.write("error: failed to generate feed entry from %s\n" % name)
                 with open(htmlpath, encoding="utf-8") as htmlfile:
                     sys.stderr.write("dumping HTML:%s\n\n" % htmlfile.read())
                 raise
@@ -591,7 +595,7 @@ def _pre_tag_insert_line_numbers(soup, pre_tag):
 def number_code_lines(htmlfilepath):
     """Insert line numbers to preformatted code blocks."""
     with open(htmlfilepath, "r+", encoding="utf-8") as htmlfileobj:
-        soup = bs4.BeautifulSoup(htmlfileobj.read())
+        soup = bs4.BeautifulSoup(htmlfileobj.read(), "lxml")
         for pre_tag in soup.find_all("pre"):
             if ((pre_tag.code is None or "class" not in pre_tag.attrs or
                  not "sourceCode" in pre_tag["class"])):
@@ -650,6 +654,10 @@ def generate_blog(fresh=False, report_total_errors=True):
             else:
                 os.remove(obj)
 
+    # nojekyll: https://help.github.com/articles/files-that-start-with-an-underscore-are-missing/
+    with open(os.path.join(BUILDDIR, ".nojekyll"), "w") as fileobj:
+        pass
+
     failed_builds = 0
     template_mtime = os.path.getmtime(HTMLTEMPLATE)
     anything_modified = False
-- 
cgit v1.2.1