diff options
-rw-r--r-- | generators/generators.py | 172 | ||||
-rwxr-xr-x | pyblog | 168 |
2 files changed, 173 insertions, 167 deletions
diff --git a/generators/generators.py b/generators/generators.py index 9ad773c2..adfe819a 100644 --- a/generators/generators.py +++ b/generators/generators.py @@ -13,6 +13,13 @@ import subprocess import lxml.etree as ET +import urllib.parse +import copy +import email.utils +import time + +from rss import * + from config.config import * from utils import utils @@ -517,3 +524,168 @@ def rewrite_title(): indexmd.write(re.sub(line, string, line)) else: indexmd.write(line) + + +def generate_index_and_feed(): + """Generate index.html and feeds (atom and rss).""" + # pylint: disable=too-many-statements,attribute-defined-outside-init,invalid-name + sys.stderr.write("generating atom and rss feeds\n") + # initialize atom feed + feed = AtomFeed() + feed.author = ET.fromstring( + "<author>" + "<name>{author}</name>" + "<uri>{home}</uri>" + "<email>{email}</email>" + "</author>".format(author=AUTHOR, home=BLOG_HOME, email=AUTHOR_EMAIL)) + feed.generator = ET.Element("generator", uri=GENERATOR_HOME_PAGE) + feed.generator.text = GENERATOR_NAME + if ATOM_ICON_PATH is not None: + feed.icon = ET.Element("icon") + feed.icon.text = urllib.parse.urljoin(BLOG_HOME, ATOM_ICON_PATH) + feed.id_text = BLOG_HOME + feed.id = ET.Element("id") + feed.id.text = feed.id_text + feed.links = [ + ET.Element("link", href=urllib.parse.urljoin(BLOG_HOME, "atom.xml"), rel="self", + type="application/atom+xml"), + ET.Element("link", href=BLOG_HOME, rel="alternate", + type="text/html"), + ] + feed.title_text = BLOG_TITLE + feed.title = ET.fromstring("<title>{title}</title>".format(title=BLOG_TITLE)) + feed.subtitle_text = BLOG_DESCRIPTION + feed.subtitle = ET.fromstring("<subtitle>{subtitle}</subtitle>" + .format(subtitle=BLOG_DESCRIPTION)) + # initialize rss feed + rss = RssFeed() + rss.rssurl = urllib.parse.urljoin(BLOG_HOME, "rss.xml") + rss.title = ET.Element("title") + rss.title.text = BLOG_TITLE + rss.link = ET.Element("link") + rss.link.text = BLOG_HOME + rss.description = ET.Element("description") + rss.description.text = BLOG_DESCRIPTION + rss.language = ET.Element("language") + rss.language.text = LANGUAGE + rss.author_text = "{email} ({name})".format(email=AUTHOR_EMAIL, name=AUTHOR) + rss.managingEditor = ET.Element("managingEditor") + rss.managingEditor.text = rss.author_text + rss.webMaster = ET.Element("webMaster") + rss.webMaster.text = rss.author_text + rss.generator = ET.Element("generator") + rss.generator.text = "{generator} ({url})".format(generator=GENERATOR_NAME, + url=GENERATOR_HOME_PAGE) + rss.image = ET.Element("image") + if RSS_ICON_PATH is not None: + ET.SubElement(rss.image, "url").text = urllib.parse.urljoin(BLOG_HOME, RSS_ICON_PATH) + rss.image.append(copy.deepcopy(rss.title)) + rss.image.append(copy.deepcopy(rss.link)) + ET.SubElement(rss.image, "width").text = str(RSS_ICON_WIDTH) + ET.SubElement(rss.image, "height").text = str(RSS_ICON_HEIGHT) + + # update times will be set after everthing finishes + + for name in os.listdir(os.path.join(BUILDDIR, "blog")): + if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}.*\.html", name): + htmlpath = os.path.join(BUILDDIR, "blog", name) + entry = AtomEntry() + item = RssItem() + try: + with open(htmlpath, encoding="utf-8") as htmlfile: + soup = bs4.BeautifulSoup(htmlfile.read(), "lxml") + + # generate atom entry + entry.author = copy.deepcopy(feed.author) # assume it's always the same author + entry_url = urllib.parse.urljoin(BLOG_HOME, "blog/%s" % name) + entry.id_text = entry_url + entry.id = ET.Element("id") + entry.id.text = entry_url + entry.relpath = "/blog/%s" % name + entry.link = ET.Element("link", href=entry_url) + entry.title_text = soup.title.text + entry.title = ET.Element("title", type="html") + entry.title.text = entry.title_text + post_date = soup.find("meta", attrs={"name": "date"})["content"] + entry.updated_datetime = dateutil.parser.parse(post_date) + entry.updated = ET.Element("updated") + # pylint: disable=no-member + entry.updated.text = entry.updated_datetime.isoformat() + + # process content + tags_to_remove = [] + # mark header and footer for removal + article = soup.article + if article.header is not None: + tags_to_remove.append(article.header) + # mark line numbers for removal + for line_number_span in article.find_all("span", + attrs={"class": "line-number"}): + tags_to_remove.append(line_number_span) + # mark script tags for removal + for script_tag in article.find_all("script"): + tags_to_remove.append(script_tag) + # make internal links absolute + utils.absolutify_links(article, entry_url) + # remove marked tags + for tag in tags_to_remove: + tag.extract() + + entry.content_html = ''.join([str(content) + for content in article.contents]) + entry.content = ET.Element("content", type="html") + entry.content.text = ET.CDATA(entry.content_html) + entry.assemble_entry() + feed.entries.append(entry) + + # generate rss item + item.title = ET.Element("title") + item.title.text = entry.title_text + item.link = ET.Element("link") + item.link.text = entry_url + item.description = ET.Element("description") + item.description.text = ET.CDATA(entry.content_html) + item.author = ET.Element("author") + item.author.text = rss.author_text + item.guid = ET.Element("guid", isPermaLink="true") + item.guid.text = item.link.text + item.timestamp = entry.updated_datetime.timestamp() + item.pubDate = ET.Element("pubDate") + item.pubDate.text = email.utils.formatdate(item.timestamp, usegmt=True) + item.assemble_item() + rss.items.append(item) + except Exception: + sys.stderr.write("error: failed to generate feed entry from %s\n" % name) + with open(htmlpath, encoding="utf-8") as htmlfile: + sys.stderr.write("dumping HTML:%s\n\n" % htmlfile.read()) + raise + # sort entries by reverse chronological order + feed.entries.sort(key=lambda entry: entry.updated_datetime, reverse=True) + rss.items.sort(key=lambda item: item.timestamp, reverse=True) + + generate_index(feed) + generate_menu() + generate_table() + generate_blog_list(feed) + generate_notes_list() + rewrite_title() + + feed.updated_datetime = utils.current_datetime() + feed.updated = ET.Element("updated") + feed.updated.text = feed.updated_datetime.isoformat() + + rss.update_timestamp = time.time() + rss.pubDate = ET.Element("pubDate") + rss.pubDate.text = email.utils.formatdate(rss.update_timestamp, usegmt=True) + rss.lastBuildDate = ET.Element("lastBuildDate") + rss.lastBuildDate.text = email.utils.formatdate(rss.update_timestamp, usegmt=True) + + with open(ATOM, "w", encoding="utf-8") as atom: + atom.write("%s\n" % feed.dump_feed(FEED_MAX_ENTRIES)) + sys.stderr.write("wrote atom.xml\n") + + with open(RSS, "w", encoding="utf-8") as rssxml: + rssxml.write("%s\n" % rss.dump_rss(FEED_MAX_ENTRIES)) + sys.stderr.write("wrote rss.xml\n") + + generate_sitemap(feed) @@ -51,172 +51,6 @@ from config.config import * from generators import generators - -def generate_index_and_feed(): - """Generate index.html and feeds (atom and rss).""" - # pylint: disable=too-many-statements,attribute-defined-outside-init,invalid-name - sys.stderr.write("generating atom and rss feeds\n") - # initialize atom feed - feed = AtomFeed() - feed.author = ET.fromstring( - "<author>" - "<name>{author}</name>" - "<uri>{home}</uri>" - "<email>{email}</email>" - "</author>".format(author=AUTHOR, home=BLOG_HOME, email=AUTHOR_EMAIL)) - feed.generator = ET.Element("generator", uri=GENERATOR_HOME_PAGE) - feed.generator.text = GENERATOR_NAME - if ATOM_ICON_PATH is not None: - feed.icon = ET.Element("icon") - feed.icon.text = urllib.parse.urljoin(BLOG_HOME, ATOM_ICON_PATH) - feed.id_text = BLOG_HOME - feed.id = ET.Element("id") - feed.id.text = feed.id_text - feed.links = [ - ET.Element("link", href=urllib.parse.urljoin(BLOG_HOME, "atom.xml"), rel="self", - type="application/atom+xml"), - ET.Element("link", href=BLOG_HOME, rel="alternate", - type="text/html"), - ] - feed.title_text = BLOG_TITLE - feed.title = ET.fromstring("<title>{title}</title>".format(title=BLOG_TITLE)) - feed.subtitle_text = BLOG_DESCRIPTION - feed.subtitle = ET.fromstring("<subtitle>{subtitle}</subtitle>" - .format(subtitle=BLOG_DESCRIPTION)) - # initialize rss feed - rss = RssFeed() - rss.rssurl = urllib.parse.urljoin(BLOG_HOME, "rss.xml") - rss.title = ET.Element("title") - rss.title.text = BLOG_TITLE - rss.link = ET.Element("link") - rss.link.text = BLOG_HOME - rss.description = ET.Element("description") - rss.description.text = BLOG_DESCRIPTION - rss.language = ET.Element("language") - rss.language.text = LANGUAGE - rss.author_text = "{email} ({name})".format(email=AUTHOR_EMAIL, name=AUTHOR) - rss.managingEditor = ET.Element("managingEditor") - rss.managingEditor.text = rss.author_text - rss.webMaster = ET.Element("webMaster") - rss.webMaster.text = rss.author_text - rss.generator = ET.Element("generator") - rss.generator.text = "{generator} ({url})".format(generator=GENERATOR_NAME, - url=GENERATOR_HOME_PAGE) - rss.image = ET.Element("image") - if RSS_ICON_PATH is not None: - ET.SubElement(rss.image, "url").text = urllib.parse.urljoin(BLOG_HOME, RSS_ICON_PATH) - rss.image.append(copy.deepcopy(rss.title)) - rss.image.append(copy.deepcopy(rss.link)) - ET.SubElement(rss.image, "width").text = str(RSS_ICON_WIDTH) - ET.SubElement(rss.image, "height").text = str(RSS_ICON_HEIGHT) - - # update times will be set after everthing finishes - - for name in os.listdir(os.path.join(BUILDDIR, "blog")): - if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}.*\.html", name): - htmlpath = os.path.join(BUILDDIR, "blog", name) - entry = AtomEntry() - item = RssItem() - try: - with open(htmlpath, encoding="utf-8") as htmlfile: - soup = bs4.BeautifulSoup(htmlfile.read(), "lxml") - - # generate atom entry - entry.author = copy.deepcopy(feed.author) # assume it's always the same author - entry_url = urllib.parse.urljoin(BLOG_HOME, "blog/%s" % name) - entry.id_text = entry_url - entry.id = ET.Element("id") - entry.id.text = entry_url - entry.relpath = "/blog/%s" % name - entry.link = ET.Element("link", href=entry_url) - entry.title_text = soup.title.text - entry.title = ET.Element("title", type="html") - entry.title.text = entry.title_text - post_date = soup.find("meta", attrs={"name": "date"})["content"] - entry.updated_datetime = dateutil.parser.parse(post_date) - entry.updated = ET.Element("updated") - # pylint: disable=no-member - entry.updated.text = entry.updated_datetime.isoformat() - - # process content - tags_to_remove = [] - # mark header and footer for removal - article = soup.article - if article.header is not None: - tags_to_remove.append(article.header) - # mark line numbers for removal - for line_number_span in article.find_all("span", - attrs={"class": "line-number"}): - tags_to_remove.append(line_number_span) - # mark script tags for removal - for script_tag in article.find_all("script"): - tags_to_remove.append(script_tag) - # make internal links absolute - utils.absolutify_links(article, entry_url) - # remove marked tags - for tag in tags_to_remove: - tag.extract() - - entry.content_html = ''.join([str(content) - for content in article.contents]) - entry.content = ET.Element("content", type="html") - entry.content.text = ET.CDATA(entry.content_html) - entry.assemble_entry() - feed.entries.append(entry) - - # generate rss item - item.title = ET.Element("title") - item.title.text = entry.title_text - item.link = ET.Element("link") - item.link.text = entry_url - item.description = ET.Element("description") - item.description.text = ET.CDATA(entry.content_html) - item.author = ET.Element("author") - item.author.text = rss.author_text - item.guid = ET.Element("guid", isPermaLink="true") - item.guid.text = item.link.text - item.timestamp = entry.updated_datetime.timestamp() - item.pubDate = ET.Element("pubDate") - item.pubDate.text = email.utils.formatdate(item.timestamp, usegmt=True) - item.assemble_item() - rss.items.append(item) - except Exception: - sys.stderr.write("error: failed to generate feed entry from %s\n" % name) - with open(htmlpath, encoding="utf-8") as htmlfile: - sys.stderr.write("dumping HTML:%s\n\n" % htmlfile.read()) - raise - # sort entries by reverse chronological order - feed.entries.sort(key=lambda entry: entry.updated_datetime, reverse=True) - rss.items.sort(key=lambda item: item.timestamp, reverse=True) - - generators.generate_index(feed) - generators.generate_menu() - generators.generate_table() - generators.generate_blog_list(feed) - generators.generate_notes_list() - generators.rewrite_title() - - feed.updated_datetime = utils.current_datetime() - feed.updated = ET.Element("updated") - feed.updated.text = feed.updated_datetime.isoformat() - - rss.update_timestamp = time.time() - rss.pubDate = ET.Element("pubDate") - rss.pubDate.text = email.utils.formatdate(rss.update_timestamp, usegmt=True) - rss.lastBuildDate = ET.Element("lastBuildDate") - rss.lastBuildDate.text = email.utils.formatdate(rss.update_timestamp, usegmt=True) - - with open(ATOM, "w", encoding="utf-8") as atom: - atom.write("%s\n" % feed.dump_feed(FEED_MAX_ENTRIES)) - sys.stderr.write("wrote atom.xml\n") - - with open(RSS, "w", encoding="utf-8") as rssxml: - rssxml.write("%s\n" % rss.dump_rss(FEED_MAX_ENTRIES)) - sys.stderr.write("wrote rss.xml\n") - - generators.generate_sitemap(feed) - - # exclude_list is only inialized once to avoid constant disk IO @utils.static_vars(exclude_list=None) def generate_blog(fresh=False, report_total_errors=True): @@ -347,7 +181,7 @@ def generate_blog(fresh=False, report_total_errors=True): utils.postprocess_html_file(dstpath) if anything_modified: - generate_index_and_feed() + generators.generate_index_and_feed() sys.stderr.write("done\n") if report_total_errors: |