diff options
Diffstat (limited to '')
-rw-r--r-- | generators/generators.py | 172 |
1 files changed, 172 insertions, 0 deletions
def _text_element(tag, text, **attributes):
    """Return a new XML element named *tag* with *attributes* and *text* as its text.

    The text is assigned through ``.text`` rather than interpolated into an XML
    string, so characters such as ``&`` and ``<`` are escaped by the serializer
    instead of breaking the parse. *text* may also be an ``ET.CDATA`` wrapper.
    """
    element = ET.Element(tag, **attributes)
    element.text = text
    return element


def _new_atom_feed():
    """Return an AtomFeed carrying the blog-wide metadata (no entries, no update time)."""
    # pylint: disable=attribute-defined-outside-init
    feed = AtomFeed()
    # Built element by element (not via ET.fromstring on a formatted string) so
    # that config values containing "&" or "<" cannot produce malformed XML.
    feed.author = ET.Element("author")
    feed.author.append(_text_element("name", AUTHOR))
    feed.author.append(_text_element("uri", BLOG_HOME))
    feed.author.append(_text_element("email", AUTHOR_EMAIL))
    feed.generator = _text_element("generator", GENERATOR_NAME, uri=GENERATOR_HOME_PAGE)
    if ATOM_ICON_PATH is not None:
        feed.icon = _text_element("icon", urllib.parse.urljoin(BLOG_HOME, ATOM_ICON_PATH))
    feed.id_text = BLOG_HOME
    feed.id = _text_element("id", feed.id_text)
    feed.links = [
        ET.Element("link", href=urllib.parse.urljoin(BLOG_HOME, "atom.xml"), rel="self",
                   type="application/atom+xml"),
        ET.Element("link", href=BLOG_HOME, rel="alternate",
                   type="text/html"),
    ]
    feed.title_text = BLOG_TITLE
    feed.title = _text_element("title", BLOG_TITLE)
    feed.subtitle_text = BLOG_DESCRIPTION
    feed.subtitle = _text_element("subtitle", BLOG_DESCRIPTION)
    return feed


def _new_rss_feed():
    """Return an RssFeed carrying the blog-wide metadata (no items, no dates)."""
    # pylint: disable=attribute-defined-outside-init,invalid-name
    rss = RssFeed()
    rss.rssurl = urllib.parse.urljoin(BLOG_HOME, "rss.xml")
    rss.title = _text_element("title", BLOG_TITLE)
    rss.link = _text_element("link", BLOG_HOME)
    rss.description = _text_element("description", BLOG_DESCRIPTION)
    rss.language = _text_element("language", LANGUAGE)
    rss.author_text = "{email} ({name})".format(email=AUTHOR_EMAIL, name=AUTHOR)
    rss.managingEditor = _text_element("managingEditor", rss.author_text)
    rss.webMaster = _text_element("webMaster", rss.author_text)
    rss.generator = _text_element(
        "generator",
        "{generator} ({url})".format(generator=GENERATOR_NAME, url=GENERATOR_HOME_PAGE))
    rss.image = ET.Element("image")
    # NOTE(review): the children below are all placed under the icon check, so
    # <image> stays empty when no icon is configured -- confirm this nesting
    # against the original file.
    if RSS_ICON_PATH is not None:
        ET.SubElement(rss.image, "url").text = urllib.parse.urljoin(BLOG_HOME, RSS_ICON_PATH)
        rss.image.append(copy.deepcopy(rss.title))
        rss.image.append(copy.deepcopy(rss.link))
        ET.SubElement(rss.image, "width").text = str(RSS_ICON_WIDTH)
        ET.SubElement(rss.image, "height").text = str(RSS_ICON_HEIGHT)
    return rss


def _feed_content_html(article, entry_url):
    """Return the inner HTML of *article* prepared for embedding in a feed.

    The article header, line-number spans and script tags are removed, and
    links are rewritten by ``utils.absolutify_links`` relative to *entry_url*.
    *article* is modified in place.
    """
    tags_to_remove = []
    # NOTE(review): an earlier comment here spoke of removing "header and
    # footer", but only the header has ever been removed -- confirm whether the
    # footer should be dropped as well.
    if article.header is not None:
        tags_to_remove.append(article.header)
    # line numbers of code listings
    tags_to_remove.extend(article.find_all("span", attrs={"class": "line-number"}))
    # scripts
    tags_to_remove.extend(article.find_all("script"))
    # make internal links absolute (done before the marked tags are extracted)
    utils.absolutify_links(article, entry_url)
    for tag in tags_to_remove:
        tag.extract()
    return ''.join([str(content) for content in article.contents])


def _build_atom_entry(soup, name, author):
    """Return an assembled AtomEntry for the parsed post *soup* stored as blog/*name*.

    *author* is the feed-level author element; it is deep-copied into the entry
    on the assumption that every post has the same author.
    """
    # pylint: disable=attribute-defined-outside-init
    entry = AtomEntry()
    entry.author = copy.deepcopy(author)
    entry_url = urllib.parse.urljoin(BLOG_HOME, "blog/%s" % name)
    entry.id_text = entry_url
    entry.id = _text_element("id", entry_url)
    entry.relpath = "/blog/%s" % name
    entry.link = ET.Element("link", href=entry_url)
    entry.title_text = soup.title.text
    entry.title = _text_element("title", entry.title_text, type="html")
    # the post date comes from <meta name="date" content="...">
    post_date = soup.find("meta", attrs={"name": "date"})["content"]
    entry.updated_datetime = dateutil.parser.parse(post_date)
    # pylint: disable=no-member
    entry.updated = _text_element("updated", entry.updated_datetime.isoformat())

    entry.content_html = _feed_content_html(soup.article, entry_url)
    entry.content = _text_element("content", ET.CDATA(entry.content_html), type="html")
    entry.assemble_entry()
    return entry


def _build_rss_item(entry, author_text):
    """Return an assembled RssItem mirroring the already-built Atom *entry*."""
    # pylint: disable=attribute-defined-outside-init,invalid-name
    item = RssItem()
    item.title = _text_element("title", entry.title_text)
    item.link = _text_element("link", entry.id_text)  # id_text holds the entry URL
    item.description = _text_element("description", ET.CDATA(entry.content_html))
    item.author = _text_element("author", author_text)
    item.guid = _text_element("guid", item.link.text, isPermaLink="true")
    item.timestamp = entry.updated_datetime.timestamp()
    item.pubDate = _text_element("pubDate",
                                 email.utils.formatdate(item.timestamp, usegmt=True))
    item.assemble_item()
    return item


def generate_index_and_feed():
    """Generate index.html and feeds (atom and rss).

    Every ``YYYY-MM-DD*.html`` file in ``BUILDDIR/blog`` is parsed into one Atom
    entry and one RSS item. The entries are sorted newest first and handed to
    the index, blog-list and sitemap generators; the menu, table, notes list
    and title rewrite are run as well. Finally the feeds are written to the
    ``ATOM`` and ``RSS`` paths, limited to ``FEED_MAX_ENTRIES`` entries.

    Raises:
        Exception: whatever went wrong while turning a post into a feed entry;
            the file name and its HTML are dumped to stderr before re-raising.
    """
    # pylint: disable=too-many-statements,attribute-defined-outside-init,invalid-name
    sys.stderr.write("generating atom and rss feeds\n")
    feed = _new_atom_feed()
    rss = _new_rss_feed()

    # update times will be set after everything finishes

    blog_dir = os.path.join(BUILDDIR, "blog")
    # Sorted listing keeps the order of same-timestamp posts reproducible
    # across builds (os.listdir order is arbitrary).
    for name in sorted(os.listdir(blog_dir)):
        # fullmatch: the name must END in .html, so backups such as
        # "2020-01-01-post.html~" are not published as posts.
        if not re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}.*\.html", name):
            continue
        htmlpath = os.path.join(blog_dir, name)
        try:
            with open(htmlpath, encoding="utf-8") as htmlfile:
                soup = bs4.BeautifulSoup(htmlfile.read(), "lxml")
            entry = _build_atom_entry(soup, name, feed.author)
            feed.entries.append(entry)
            rss.items.append(_build_rss_item(entry, rss.author_text))
        except Exception:
            sys.stderr.write("error: failed to generate feed entry from %s\n" % name)
            with open(htmlpath, encoding="utf-8") as htmlfile:
                sys.stderr.write("dumping HTML:%s\n\n" % htmlfile.read())
            raise

    # sort entries by reverse chronological order
    feed.entries.sort(key=lambda entry: entry.updated_datetime, reverse=True)
    rss.items.sort(key=lambda item: item.timestamp, reverse=True)

    generate_index(feed)
    generate_menu()
    generate_table()
    generate_blog_list(feed)
    generate_notes_list()
    rewrite_title()

    feed.updated_datetime = utils.current_datetime()
    feed.updated = _text_element("updated", feed.updated_datetime.isoformat())

    rss.update_timestamp = time.time()
    build_date = email.utils.formatdate(rss.update_timestamp, usegmt=True)
    rss.pubDate = _text_element("pubDate", build_date)
    rss.lastBuildDate = _text_element("lastBuildDate", build_date)

    with open(ATOM, "w", encoding="utf-8") as atom:
        atom.write("%s\n" % feed.dump_feed(FEED_MAX_ENTRIES))
        sys.stderr.write("wrote atom.xml\n")

    with open(RSS, "w", encoding="utf-8") as rssxml:
        rssxml.write("%s\n" % rss.dump_rss(FEED_MAX_ENTRIES))
        sys.stderr.write("wrote rss.xml\n")

    generate_sitemap(feed)