diff options
author | Zhiming Wang <zmwangx@gmail.com> | 2015-05-05 00:08:53 -0700 |
---|---|---|
committer | Zhiming Wang <zmwangx@gmail.com> | 2015-05-05 00:08:53 -0700 |
commit | d14e9ac5b86c911cb255ab30425790488c20fb4d (patch) | |
tree | 7a14faa9dd1e513850f33ced91d14729c3cb36e7 /pyblog | |
parent | 07bf43a314fe65ccd9c7cb663c3c6134a47cc269 (diff) | |
download | my_new_personal_website-d14e9ac5b86c911cb255ab30425790488c20fb4d.tar.xz my_new_personal_website-d14e9ac5b86c911cb255ab30425790488c20fb4d.zip |
a lot of work
Mainly generating feed and index.
Diffstat (limited to 'pyblog')
-rwxr-xr-x | pyblog | 249 |
1 files changed, 242 insertions, 7 deletions
#!/usr/bin/env python3

# TODO: timestamp to ISO

"""A simple blog generator with Pandoc as backend."""

import argparse
import datetime
import io
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import xml.etree.ElementTree as ET

import bs4
import dateutil.parser
import dateutil.tz

ROOTDIR = os.path.dirname(os.path.realpath(__file__))
SOURCEDIR = os.path.join(ROOTDIR, "source")
INDEXMD = os.path.join(SOURCEDIR, "index.md")
TEMPLATEDIR = os.path.join(ROOTDIR, "templates")
HTMLTEMPLATE = os.path.join(TEMPLATEDIR, "template.html")
BUILDDIR = os.path.join(ROOTDIR, "build")
ATOM = os.path.join(BUILDDIR, "atom.xml")
INDEXHTML = os.path.join(BUILDDIR, "index.html")

# Maximum number of entries written to the Atom feed.
FEED_MAX_ENTRIES = 20


# HACK: teach ElementTree to emit CDATA sections.
# XML suuuuuucks.
# http://stackoverflow.com/a/30019607/1944784
#
# This relies on private ElementTree names (ET._serialize_xml, ET._serialize,
# ET._escape_cdata), so it may break with a different Python version.

def CDATA(text=None):
    """Return a pseudo-element that is serialized as a CDATA section.

    The element uses the sentinel tag '![CDATA[', which the patched
    serializer below recognizes; *text* becomes the section's content.
    """
    element = ET.Element('![CDATA[')
    element.text = text
    return element


# Save the stock serializer only once.  If this module is executed a second
# time (e.g. importlib.reload), ET._serialize_xml is already our wrapper, and
# saving it again would make the wrapper delegate to itself forever.
if not hasattr(ET, "_original_serialize_xml"):
    ET._original_serialize_xml = ET._serialize_xml


def _serialize_xml(write, elem, qnames, namespaces, short_empty_elements,
                   **kwargs):
    """Serialize *elem*, writing CDATA pseudo-elements verbatim.

    Elements created by CDATA() are written as '<![CDATA[...]]>' without
    escaping; every other element is passed to the stock serializer.
    """
    if elem.tag == '![CDATA[':
        # A missing text would otherwise be formatted as the string "None".
        text = elem.text or ""
        # "]]>" cannot appear inside a CDATA section; split it over two
        # adjacent sections so the content round-trips unchanged.
        text = text.replace("]]>", "]]]]><![CDATA[>")
        write("\n<{}{}]]>\n".format(elem.tag, text))
        if elem.tail:
            # _escape_cdata lives in the ET module, not in this one.
            write(ET._escape_cdata(elem.tail))
    else:
        return ET._original_serialize_xml(write, elem, qnames, namespaces,
                                          short_empty_elements, **kwargs)


ET._serialize_xml = ET._serialize['xml'] = _serialize_xml


class AtomFeed(object):
    """Class for storing atom:feed data and metadata."""

    def __init__(self):
        """Define available attributes."""
        self.author = None  # atom:author
        self.generator = None  # atom:generator, optional
        self.icon = None  # atom:icon, optional
        self.logo = None  # atom:logo, optional
        self.id_text = None  # atom:id, just use URI
        self.id = None  # atom:id
        self.links = []  # list of atom:link
        self.title_text = None  # plain text title
        self.title = None  # atom:title
        self.updated_datetime = None  # update time as a datetime object
        self.updated = None  # atom:updated
        self.entries = []  # list of atom:entry, in reverse time order
        self.feed = None  # atom:feed, assembled

    def assemble_feed(self):
        """Assemble atom:feed from the attributes into self.feed.

        title, updated, id and author must already hold elements; icon,
        logo and generator are appended only when set.  Each item of
        self.entries must already have its ``entry`` element assembled.
        """
        self.feed = ET.Element("feed", xmlns="http://www.w3.org/2005/Atom")
        self.feed.append(self.title)
        for link in self.links:
            self.feed.append(link)
        self.feed.append(self.updated)
        self.feed.append(self.id)
        self.feed.append(self.author)
        if self.icon is not None:
            self.feed.append(self.icon)
        if self.logo is not None:
            self.feed.append(self.logo)
        if self.generator is not None:
            self.feed.append(self.generator)
        # include at most FEED_MAX_ENTRIES entries in the feed
        for entry in self.entries[:FEED_MAX_ENTRIES]:
            self.feed.append(entry.entry)

    def dump_feed(self):
        """Return the atom:feed XML as a string, assembling it if needed."""
        if self.feed is None:
            self.assemble_feed()
        return ET.tostring(self.feed).decode('utf-8')


class AtomEntry(object):
    """Class for storing atom:entry data and metadata."""

    def __init__(self):
        """Define available attributes."""
        self.author = None  # atom:author
        self.id_text = None  # atom:id, just use URI
        self.id = None  # atom:id
        self.relpath = None  # HTML page path relative to home
        self.link = None  # atom:link
        self.title_text = None  # plain text title
        self.title = None  # atom:title
        self.updated_datetime = None  # update time as a datetime object
        self.updated = None  # atom:updated
        self.content_html = None  # content as HTML markup
        self.content = None  # atom:content
        self.entry = None  # atom:entry, assembled

    def assemble_entry(self):
        """Assemble atom:entry from the attributes into self.entry.

        title, link, updated, id, author and content must already hold
        elements.
        """
        self.entry = ET.Element("entry")
        self.entry.append(self.title)
        self.entry.append(self.link)
        self.entry.append(self.updated)
        self.entry.append(self.id)
        self.entry.append(self.author)
        self.entry.append(self.content)

    def dump_entry(self):
        """Return the atom:entry XML as a string, assembling it if needed."""
        if self.entry is None:
            self.assemble_entry()
        return ET.tostring(self.entry).decode('utf-8')


# TODO:
def new_post():
    pass


def generate_index(feed):
    """Generate index.html from index.md and a TOC.

    The TOC is built from feed.entries (expected in reverse chronological
    order), grouped under one <h2> per year.  It is appended to the content
    of index.md (if that file exists) in a temporary file, which pandoc then
    renders to INDEXHTML using HTMLTEMPLATE.

    Returns True if pandoc exited successfully, False otherwise.  The
    temporary file is removed in either case.
    """

    sys.stderr.write("generating index.html\n")

    # generate TOC
    tocbuff = io.StringIO()
    tocbuff.write('<div class="indextoc" id="toc">')
    year = None  # year of the heading most recently written
    # recall that entries are in reverse chronological order
    for entry in feed.entries:
        date = entry.updated_datetime
        if year is None or date.year < year:
            # write a new <h2 class="toc"> tag with the smaller year
            year = date.year
            tocbuff.write(u'\n<h2 class="toc" id="{0}" datetime="{0}">{0}</h2>\n\n'.format(year))

        # write a new <li> entry (<ul>) in Markdown, in the format:
        #   * <time class="tocdate" datetime="2015-05-05T00:06:04-07:00">May 05</time>
        #     [Blah blah](/blog/2015-05-04-blah-blah.html)
        monthday = date.strftime("%B %d")
        tocbuff.write(u'* <time class="tocdate" datetime="%s">%s</time> [%s](%s)\n' %
                      (date.isoformat(), monthday, entry.title_text, entry.relpath))
    tocbuff.write('</div>')
    toc = tocbuff.getvalue()
    tocbuff.close()

    # create tempfile with index.md and the TOC concatenated, and generate
    # index.html from that
    fd, tmppath = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as tmpfile:
            if os.path.exists(INDEXMD):
                with open(INDEXMD, 'r', encoding='utf-8') as indexmd:
                    tmpfile.write(u"%s\n\n<hr>\n\n" % indexmd.read())
            tmpfile.write("%s\n" % toc)

        pandoc_args = [
            "pandoc", tmppath,
            "--template", HTMLTEMPLATE,
            "--highlight-style=pygments",
            "-o", INDEXHTML,
        ]
        try:
            subprocess.check_call(pandoc_args)
        except subprocess.CalledProcessError:
            # There is no failure counter in this scope; report the failure
            # through the return value instead.
            sys.stderr.write("error: failed to generate index.html\n")
            return False
        return True
    finally:
        # always clean up, even if pandoc is missing or writing failed
        os.remove(tmppath)


# File names of generated posts look like YYYY-MM-DD<anything>.html.
_POST_NAME_RE = re.compile(r"^(\d{4})-(\d{2})-(\d{2}).*\.html$")


def generate_index_and_feed():
    """Generate index.html and atom feed.

    Scans BUILDDIR/blog for generated post pages, builds one AtomEntry per
    post from the page's <title>, its <meta name="date"> and the contents of
    its <article> (minus header and footer), then writes index.html via
    generate_index() and the feed to ATOM.

    Returns the result of generate_index(): True if index.html was
    generated, False otherwise.
    """
    sys.stderr.write("generating atom feed\n")
    # initialize feed
    feed = AtomFeed()
    # TODO: Put hard-coded values in a config file
    feed.author = ET.fromstring('<author><name>Zhiming Wang</name><uri>https://github.com/zmwangx</uri><email>zmwangx@gmail.com</email></author>')
    feed.generator = ET.Element("generator", uri="https://github.com/zmwangx/zmwangx.github.io")
    feed.generator.text = "pyblog"
    # TODO: feed.icon
    feed.id_text = "http://zmwangx.github.io"
    feed.id = ET.Element("id")
    feed.id.text = feed.id_text
    feed.links = [
        ET.Element("link", href="http://zmwangx.github.io/atom.xml", rel="self"),
        ET.Element("link", href="http://zmwangx.github.io/"),
    ]
    feed.title_text = "dl? cmplnts?"
    # Set the text on an element instead of parsing a formatted string, so
    # the title never has to be valid XML markup by itself.
    feed.title = ET.Element("title")
    feed.title.text = feed.title_text
    # update time will be set after everything finishes

    postspath = os.path.join(BUILDDIR, "blog")
    # no blog directory simply means there are no posts
    names = os.listdir(postspath) if os.path.isdir(postspath) else []
    # traverse all posts in reverse time order
    for name in sorted(names, reverse=True):
        if not _POST_NAME_RE.match(name):
            continue
        htmlpath = os.path.join(postspath, name)
        entry = AtomEntry()
        with open(htmlpath, encoding="utf-8") as htmlfile:
            # name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed
            soup = bs4.BeautifulSoup(htmlfile.read(), "html.parser")
        entry.author = feed.author  # assume it's always the same author
        entry.id_text = "%s/blog/%s" % (feed.id_text, name)
        entry.id = ET.Element("id")
        entry.id.text = entry.id_text
        entry.relpath = "/blog/%s" % name
        entry.link = ET.Element("link", href=entry.id_text)
        entry.title_text = soup.title.text
        entry.title = ET.Element("title", type="html")
        entry.title.text = entry.title_text
        post_date = soup.find("meta", attrs={"name": "date"})["content"]
        entry.updated_datetime = dateutil.parser.parse(post_date)
        entry.updated = ET.Element("updated")
        entry.updated.text = entry.updated_datetime.isoformat()
        # extract the article content without header and footer
        article = soup.article
        for decoration in (article.header, article.footer):
            # either one may be absent from a page
            if decoration is not None:
                decoration.extract()
        entry.content_html = ''.join(str(content)
                                     for content in article.contents)
        entry.content = ET.Element("content", type="html")
        entry.content.append(CDATA(entry.content_html))
        entry.assemble_entry()
        feed.entries.append(entry)

    index_ok = generate_index(feed)

    feed.updated_datetime = datetime.datetime.fromtimestamp(round(time.time()),
                                                            dateutil.tz.tzlocal())
    feed.updated = ET.Element("updated")
    feed.updated.text = feed.updated_datetime.isoformat()

    with open(ATOM, 'w', encoding='utf-8') as atom:
        atom.write("%s\n" % feed.dump_feed())
        sys.stderr.write("wrote atom.xml\n")

    return index_ok


# ---------------------------------------------------------------------------
# The remaining hunks of this diff modify generate(), which is only partially
# visible here.  They are reproduced below as they appear in the diff (line
# breaks and indentation reconstructed from the flattened text).
#
#  def generate(fresh=False):
# @@ -67,6 +293,8 @@ def generate(fresh=False):
#                  os.remove(obj)
#
#      failed_builds = 0
# +    template_mtime = os.path.getmtime(HTMLTEMPLATE)
# +    anything_modified = False
#
#      for root, _, files in os.walk(SOURCEDIR):
#          relroot = os.path.relpath(root, start=SOURCEDIR)
# @@ -78,7 +306,7 @@ def generate(fresh=False):
#
#          for name in files:
#              extension = name.split(".")[-1]
# -            if extension not in ["css", "md"]:
# +            if extension not in ["css", "jpg", "md", "png", "svg"]:
#                  continue
#
#              relpath = os.path.join(relroot, name)
# @@ -88,8 +316,13 @@ def generate(fresh=False):
#              else:
#                  dstpath = os.path.join(dstroot, name)
#              if ((not os.path.exists(dstpath) or
# -                 os.path.getmtime(dstpath) <= os.path.getmtime(srcpath))):
# -                if extension == "css":
# +                 os.path.getmtime(dstpath) <=
# +                 max(template_mtime, os.path.getmtime(srcpath)))):
# +                # new post or modified post
# +                anything_modified = True
# +                if srcpath == INDEXMD:
# +                    continue # index will be processed separately
# +                if extension in ["css", "jpg", "png", "svg"]:
#                      sys.stderr.write("copying %s\n" % relpath)
#                      shutil.copy(srcpath, dstpath)
#                  elif extension == "md":
# @@ -106,6 +339,8 @@ def generate(fresh=False):
#                          failed_builds += 1
#                          sys.stderr.write("error: failed to generate %s" % relpath)
#
# +    if anything_modified:
# +        generate_index_and_feed()
#      sys.stderr.write("build finished with %d errors\n" % failed_builds)
#      return failed_builds
# ---------------------------------------------------------------------------