diff options
author | Zhiming Wang <zmwangx@gmail.com> | 2015-07-05 23:35:08 -0700 |
---|---|---|
committer | Zhiming Wang <zmwangx@gmail.com> | 2015-07-05 23:35:08 -0700 |
commit | 8eeba026961719e5123195d354dacf75938dff62 (patch) | |
tree | 5c6437e36f1a4dd33d14f174de899d16bcef5025 /pyblog | |
parent | a3a4bfee3d1acf00fc35e9f48738d7e6adce6ca0 (diff) | |
download | my_new_personal_website-8eeba026961719e5123195d354dacf75938dff62.tar.xz my_new_personal_website-8eeba026961719e5123195d354dacf75938dff62.zip |
fixes for beautifulsoup4 4.4.0
* Explicitly specify the lxml parser to suppress warnings;
* Supposedly fix the
AttributeError: 'NoneType' object has no attribute 'next_element'
issue [1] with BeautifulSoup when finding elements after extracting
certain tags. I bet I've encountered that error before, but it seemed
to go away somehow. Not this time. Without digging into too much
detail (don't have time to file a bug or anything), this fix --
precompiling a list of tags to extract, and only extracting them after
all other processing is done -- seems to work.
* Add .nojekyll to root of build
dir (https://help.github.com/articles/files-that-start-with-an-underscore-are-missing/).
[1]: Full traceback:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-7-9d3d20b34e0c> in <module>()
----> 1 a.footer
/Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in __getattr__(self, tag)
995 # We special case contents to avoid recursion.
996 elif not tag.startswith("__") and not tag=="contents":
--> 997 return self.find(tag)
998 raise AttributeError(
999 "'%s' object has no attribute '%s'" % (self.__class__, tag))
/Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in find(self, name, attrs, recursive, text, **kwargs)
1232 criteria."""
1233 r = None
-> 1234 l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
1235 if l:
1236 r = l[0]
/Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in find_all(self, name, attrs, recursive, text, limit, **kwargs)
1253 if not recursive:
1254 generator = self.children
-> 1255 return self._find_all(name, attrs, text, limit, generator, **kwargs)
1256 findAll = find_all # BS3
1257 findChildren = find_all # BS2
/Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in _find_all(self, name, attrs, text, limit, generator, **kwargs)
527 while True:
528 try:
--> 529 i = next(generator)
530 except StopIteration:
531 break
/Users/zmwang/.pyenv/versions/3.4.3/lib/python3.4/site-packages/beautifulsoup4-4.4.0-py3.4.egg/bs4/element.py in descendants(self)
1271 while current is not stopNode:
1272 yield current
-> 1273 current = current.next_element
1274
1275 # CSS selector code
AttributeError: 'NoneType' object has no attribute 'next_element'
Diffstat (limited to 'pyblog')
-rwxr-xr-x | pyblog | 30 |
1 files changed, 19 insertions, 11 deletions
@@ -381,7 +381,7 @@ def generate_sitemap(feed): # try to extract updated time updated = None with open(fullpath, encoding="utf-8") as htmlobj: - soup = bs4.BeautifulSoup(htmlobj.read()) + soup = bs4.BeautifulSoup(htmlobj.read(), "lxml") if soup.article.footer is not None: updated_tag = soup.article.footer.find(attrs={"class": "updated"}) if updated_tag is not None: @@ -480,7 +480,7 @@ def generate_index_and_feed(): item = RssItem() try: with open(htmlpath, encoding="utf-8") as htmlfile: - soup = bs4.BeautifulSoup(htmlfile.read()) + soup = bs4.BeautifulSoup(htmlfile.read(), "lxml") # generate atom entry entry.author = copy.deepcopy(feed.author) # assume it's always the same author @@ -500,21 +500,25 @@ def generate_index_and_feed(): entry.updated.text = entry.updated_datetime.isoformat() # process content - # extract the article content without header and footer + tags_to_remove = [] + # mark header and footer for removal article = soup.article if article.header is not None: - article.header.extract() + tags_to_remove.append(article.header) if article.footer is not None: - article.footer.extract() - # remove line numbers + tags_to_remove.append(article.footer) + # mark line numbers for removal for line_number_span in article.find_all("span", attrs={"class": "line-number"}): - line_number_span.extract() - # remove script tags + tags_to_remove.append(line_number_span) + # mark script tags for removal for script_tag in article.find_all("script"): - script_tag.extract() + tags_to_remove.append(script_tag) # make internal links absolute abosolutify_links(article, entry_url) + # remove marked tags + for tag in tags_to_remove: + tag.extract() entry.content_html = ''.join([str(content) for content in article.contents]) @@ -540,7 +544,7 @@ def generate_index_and_feed(): item.assemble_item() rss.items.append(item) except Exception: - sys.stderr.write("failed to generate feed entry from %s" % name) + sys.stderr.write("error: failed to generate feed entry from %s\n" % name) 
with open(htmlpath, encoding="utf-8") as htmlfile: sys.stderr.write("dumping HTML:%s\n\n" % htmlfile.read()) raise @@ -591,7 +595,7 @@ def _pre_tag_insert_line_numbers(soup, pre_tag): def number_code_lines(htmlfilepath): """Insert line numbers to preformatted code blocks.""" with open(htmlfilepath, "r+", encoding="utf-8") as htmlfileobj: - soup = bs4.BeautifulSoup(htmlfileobj.read()) + soup = bs4.BeautifulSoup(htmlfileobj.read(), "lxml") for pre_tag in soup.find_all("pre"): if ((pre_tag.code is None or "class" not in pre_tag.attrs or not "sourceCode" in pre_tag["class"])): @@ -650,6 +654,10 @@ def generate_blog(fresh=False, report_total_errors=True): else: os.remove(obj) + # nojekyll: https://help.github.com/articles/files-that-start-with-an-underscore-are-missing/ + with open(os.path.join(BUILDDIR, ".nojekyll"), "w") as fileobj: + pass + failed_builds = 0 template_mtime = os.path.getmtime(HTMLTEMPLATE) anything_modified = False |