#!/usr/bin/env python3

"""A simple blog generator with Pandoc as backend."""

import argparse
import datetime
import io
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import xml.etree.ElementTree as ET

import bs4
import dateutil.parser
import dateutil.tz

ROOTDIR = os.path.dirname(os.path.realpath(__file__))
SOURCEDIR = os.path.join(ROOTDIR, "source")
INDEXMD = os.path.join(SOURCEDIR, "index.md")
TEMPLATEDIR = os.path.join(ROOTDIR, "templates")
HTMLTEMPLATE = os.path.join(TEMPLATEDIR, "template.html")
BUILDDIR = os.path.join(ROOTDIR, "build")
ATOM = os.path.join(BUILDDIR, "atom.xml")
INDEXHTML = os.path.join(BUILDDIR, "index.html")

FEED_MAX_ENTRIES = 20

# File extensions that are copied verbatim into the build directory.
_STATIC_EXTENSIONS = ("css", "jpg", "png", "svg", "ico")

# Pseudo tag name used to smuggle CDATA sections through ElementTree.
_CDATA_TAG = '![CDATA['


# Hack ET to support CDATA.
# XML suuuuuucks.
# http://stackoverflow.com/a/30019607/1944784

def CDATA(text=None):
    """Return a pseudo element that is serialized as a CDATA section.

    Parameters
    ----------
    text : str, optional
        Raw character data to wrap in the CDATA section.

    Returns
    -------
    xml.etree.ElementTree.Element
        Element with the special tag recognized by the patched serializer
        installed below.

    """
    element = ET.Element(_CDATA_TAG)
    element.text = text
    return element


ET._original_serialize_xml = ET._serialize_xml


def _serialize_xml(write, elem, qnames, namespaces, short_empty_elements,
                   **kwargs):
    """Serialize elem, emitting CDATA pseudo elements verbatim.

    Drop-in replacement for the private ``ET._serialize_xml``; every element
    other than the CDATA pseudo element is delegated to the original
    serializer.
    """
    if elem.tag != _CDATA_TAG:
        return ET._original_serialize_xml(write, elem, qnames, namespaces,
                                          short_empty_elements, **kwargs)

    # A missing payload is written as an empty section rather than "None".
    text = elem.text if elem.text is not None else ""
    # "]]>" would terminate the section early; split it across two adjacent
    # CDATA sections, which parsers concatenate back together.
    text = text.replace("]]>", "]]]]><![CDATA[>")
    write("\n<{}{}]]>\n".format(elem.tag, text))
    if elem.tail:
        # _escape_cdata lives in the ET module, not in this one.
        write(ET._escape_cdata(elem.tail))
    return None


ET._serialize_xml = ET._serialize['xml'] = _serialize_xml


class AtomFeed(object):
    """Class for storing atom:feed data and metadata."""

    def __init__(self):
        """Define available attributes."""
        self.author = None  # atom:author
        self.generator = None  # atom:generator, optional
        self.icon = None  # atom:icon, optional
        self.logo = None  # atom:logo, optional
        self.id_text = None  # atom:id, just use URI
        self.id = None  # atom:id
        self.links = []  # list of atom:link
        self.title_text = None  # plain text title
        self.title = None  # atom:title
        self.updated_datetime = None  # update time as a datetime object
        self.updated = None  # atom:updated
        self.entries = []  # list of atom:entry, in reverse time order
        self.feed = None  # atom:feed, assembled

    def assemble_feed(self):
        """Assemble atom:feed from the individual elements.

        The mandatory elements (title, updated, id, author) must already be
        set; optional ones (icon, logo, generator) are skipped when None.
        """
        self.feed = ET.Element("feed", xmlns="http://www.w3.org/2005/Atom")
        self.feed.append(self.title)
        for link in self.links:
            self.feed.append(link)
        self.feed.append(self.updated)
        self.feed.append(self.id)
        self.feed.append(self.author)
        if self.icon is not None:
            self.feed.append(self.icon)
        if self.logo is not None:
            self.feed.append(self.logo)
        if self.generator is not None:
            self.feed.append(self.generator)
        # include at most FEED_MAX_ENTRIES entries in the feed
        for entry in self.entries[:FEED_MAX_ENTRIES]:
            self.feed.append(entry.entry)

    def dump_feed(self):
        """Dump atom:feed XML as a string, assembling the feed if needed."""
        if self.feed is None:
            self.assemble_feed()
        return ET.tostring(self.feed).decode('utf-8')


class AtomEntry(object):
    """Class for storing atom:entry data and metadata."""

    def __init__(self):
        """Define available attributes."""
        self.author = None  # atom:author
        self.id_text = None  # atom:id, just use URI
        self.id = None  # atom:id
        self.relpath = None  # HTML page path relative to home
        self.link = None  # atom:link
        self.title_text = None  # plain text title
        self.title = None  # atom:title
        self.updated_datetime = None  # update time as a datetime object
        self.updated = None  # atom:updated
        self.content_html = None  # content as HTML markup
        self.content = None  # atom:content
        self.entry = None  # atom:entry, assembled

    def assemble_entry(self):
        """Assemble atom:entry from the individual elements."""
        self.entry = ET.Element("entry")
        self.entry.append(self.title)
        self.entry.append(self.link)
        self.entry.append(self.updated)
        self.entry.append(self.id)
        self.entry.append(self.author)
        self.entry.append(self.content)

    def dump_entry(self):
        """Dump atom:entry XML as a string, assembling the entry if needed."""
        if self.entry is None:
            self.assemble_entry()
        return ET.tostring(self.entry).decode('utf-8')


# TODO:
def new_post():
    """Create a new post (not implemented yet)."""
    pass


def generate_index(feed):
    """Generate index.html from index.md and a TOC.

    Parameters
    ----------
    feed : AtomFeed
        Feed whose ``entries`` are listed, in reverse chronological order,
        in the table of contents.

    Returns
    -------
    failed_builds : int
        1 if pandoc failed to build index.html, 0 otherwise.

    """
    sys.stderr.write("generating index.html\n")

    # generate TOC
    # NOTE(review): the HTML markup inside the string literals of this
    # function was not recoverable from the reviewed source; the wrapper div,
    # the year heading, the per-entry line and the separator below are
    # best-guess reconstructions -- confirm against the template/CSS.
    tocbuff = io.StringIO()
    tocbuff.write('<div class="indextoc" id="toc">')
    year = 10000  # will be larger than the latest year for quite a while
    # recall that entries are in reverse chronological order
    for entry in feed.entries:
        date = entry.updated_datetime
        if date.year < year:
            # write a new heading with the smaller year
            year = date.year
            tocbuff.write(u'\n<h2 class="toc" id="{0}">{0}</h2>\n\n'
                          .format(year))
        # write a new Markdown list item linking to the post
        tocbuff.write(u'* <time class="tocdate" datetime="%s">%s</time> '
                      u'[%s](%s)\n' %
                      (date.isoformat(), date.strftime("%m/%d"),
                       entry.title_text, entry.relpath))
    tocbuff.write('</div>')
    toc_markup = tocbuff.getvalue()
    tocbuff.close()

    # create tempfile with index.md and the TOC concatenated, and generate
    # index.html from that
    fd, tmppath = tempfile.mkstemp()
    os.close(fd)
    failed_builds = 0
    try:
        with open(tmppath, 'w', encoding='utf-8') as tmpfile:
            if os.path.exists(INDEXMD):
                with open(INDEXMD, 'r', encoding='utf-8') as indexmd:
                    tmpfile.write(u"%s\n\n<hr>\n\n" % indexmd.read())
            tmpfile.write("%s\n" % toc_markup)

        pandoc_args = [
            "pandoc", tmppath,
            "--template", HTMLTEMPLATE,
            "--highlight-style=pygments",
            "-o", INDEXHTML,
        ]
        try:
            subprocess.check_call(pandoc_args)
        except subprocess.CalledProcessError:
            failed_builds += 1
            sys.stderr.write("error: failed to generate index.html\n")
    finally:
        # always clean up the temporary Markdown file
        os.remove(tmppath)
    return failed_builds


def generate_index_and_feed():
    """Generate index.html and atom feed.

    Returns
    -------
    failed_builds : int
        Number of build failures encountered while generating index.html.

    """
    sys.stderr.write("generating atom feed\n")
    # initialize feed
    feed = AtomFeed()
    # TODO: Put hard-coded values in a config file
    # Build the atom:author element through the ET API so that no hand
    # written XML needs to be parsed.
    feed.author = ET.Element("author")
    ET.SubElement(feed.author, "name").text = "Zhiming Wang"
    ET.SubElement(feed.author, "uri").text = "https://github.com/zmwangx"
    ET.SubElement(feed.author, "email").text = "zmwangx@gmail.com"
    feed.generator = ET.Element(
        "generator", uri="https://github.com/zmwangx/zmwangx.github.io")
    feed.generator.text = "pyblog"
    # TODO: feed.icon
    feed.id_text = "http://zmwangx.github.io"
    feed.id = ET.Element("id")
    feed.id.text = feed.id_text
    feed.links = [
        ET.Element("link", href="http://zmwangx.github.io/atom.xml",
                   rel="self"),
        ET.Element("link", href="http://zmwangx.github.io/"),
    ]
    feed.title_text = "dl? cmplnts?"
    # Setting .text lets ET escape the title instead of formatting it into
    # an XML string by hand.
    feed.title = ET.Element("title")
    feed.title.text = feed.title_text
    # update time will be set after everything finishes

    postspath = os.path.join(BUILDDIR, "blog")
    # A site without any post yet has no blog directory; treat it as empty.
    names = os.listdir(postspath) if os.path.isdir(postspath) else []
    # traverse all posts in reverse time order
    for name in sorted(names, reverse=True):
        if not re.match(r"^(\d{4})-(\d{2})-(\d{2}).*\.html", name):
            continue

        htmlpath = os.path.join(postspath, name)
        entry = AtomEntry()
        with open(htmlpath, encoding="utf-8") as htmlfile:
            soup = bs4.BeautifulSoup(htmlfile.read())

        entry.author = feed.author  # assume it's always the same author
        entry.id_text = "%s/blog/%s" % (feed.id_text, name)
        entry.id = ET.Element("id")
        entry.id.text = entry.id_text
        entry.relpath = "/blog/%s" % name
        entry.link = ET.Element("link", href=entry.id_text)
        entry.title_text = soup.title.text
        entry.title = ET.Element("title", type="html")
        entry.title.text = entry.title_text
        post_date = soup.find("meta", attrs={"name": "date"})["content"]
        entry.updated_datetime = dateutil.parser.parse(post_date)
        entry.updated = ET.Element("updated")
        entry.updated.text = entry.updated_datetime.isoformat()

        # extract the article content without header and footer
        article = soup.article
        if article.header is not None:
            article.header.extract()
        if article.footer is not None:
            article.footer.extract()
        entry.content_html = ''.join(str(content)
                                     for content in article.contents)
        entry.content = ET.Element("content", type="html")
        entry.content.append(CDATA(entry.content_html))
        entry.assemble_entry()
        feed.entries.append(entry)

    failed_builds = generate_index(feed)

    feed.updated_datetime = datetime.datetime.fromtimestamp(
        round(time.time()), dateutil.tz.tzlocal())
    feed.updated = ET.Element("updated")
    feed.updated.text = feed.updated_datetime.isoformat()

    with open(ATOM, 'w', encoding='utf-8') as atom:
        atom.write("%s\n" % feed.dump_feed())
        sys.stderr.write("wrote atom.xml\n")
    return failed_builds


def generate(fresh=False):
    """Generate the blog in BUILDDIR.

    Parameters
    ----------
    fresh : bool
        If True, remove all existing build artifacts and start afresh;
        otherwise, only copy or build new or modified files. Default is
        False.

    Returns
    -------
    failed_builds : int
        Number of build failures.

    Raises
    ------
    OSError
        If the source directory or the HTML template does not exist.

    """
    # pylint: disable=too-many-branches

    if not os.path.isdir(SOURCEDIR):
        raise OSError("source directory %s does not exist" % SOURCEDIR)
    if not os.path.exists(HTMLTEMPLATE):
        raise OSError("HTML template %s not found" % HTMLTEMPLATE)

    if not os.path.isdir(BUILDDIR):
        # a stray non-directory occupying the build path is replaced
        if os.path.exists(BUILDDIR):
            os.remove(BUILDDIR)
        os.mkdir(BUILDDIR, mode=0o755)
    if fresh:
        for name in os.listdir(BUILDDIR):
            if name == ".git":
                continue  # keep the repository metadata of the build dir
            obj = os.path.join(BUILDDIR, name)
            if os.path.isdir(obj):
                shutil.rmtree(obj)
            else:
                os.remove(obj)

    failed_builds = 0
    template_mtime = os.path.getmtime(HTMLTEMPLATE)
    anything_modified = False

    for root, _, files in os.walk(SOURCEDIR):
        relroot = os.path.relpath(root, start=SOURCEDIR)
        dstroot = os.path.join(BUILDDIR, relroot)
        if not os.path.isdir(dstroot):
            if os.path.exists(dstroot):
                os.remove(dstroot)
            os.mkdir(dstroot, mode=0o755)

        for name in files:
            extension = name.split(".")[-1]
            if extension != "md" and extension not in _STATIC_EXTENSIONS:
                continue

            relpath = os.path.join(relroot, name)
            srcpath = os.path.join(root, name)
            if extension == "md":
                dstpath = os.path.join(dstroot,
                                       re.sub(r'\.md$', '.html', name))
            else:
                dstpath = os.path.join(dstroot, name)

            # Skip targets newer than both the source and the template.
            if (os.path.exists(dstpath) and
                    os.path.getmtime(dstpath) >
                    max(template_mtime, os.path.getmtime(srcpath))):
                continue

            # new post or modified post
            anything_modified = True
            if srcpath == INDEXMD:
                continue  # index will be processed separately

            if extension in _STATIC_EXTENSIONS:
                sys.stderr.write("copying %s\n" % relpath)
                shutil.copy(srcpath, dstpath)
            elif extension == "md":
                sys.stderr.write("generating %s\n" % relpath)
                pandoc_args = [
                    "pandoc", srcpath,
                    "--template", HTMLTEMPLATE,
                    "--highlight-style=pygments",
                    "-o", dstpath,
                ]
                try:
                    subprocess.check_call(pandoc_args)
                except subprocess.CalledProcessError:
                    failed_builds += 1
                    sys.stderr.write("error: failed to generate %s\n" %
                                     relpath)

    if anything_modified:
        failed_builds += generate_index_and_feed()

    sys.stderr.write("build finished with %d errors\n" % failed_builds)
    return failed_builds


# TODO:
def deploy():
    """Deploy the generated blog (not implemented yet)."""
    pass


# TODO: regenerate and deploy
def gen_deploy():
    """Regenerate and deploy the blog (not implemented yet)."""
    pass


# TODO: start HTTP server in another process and watch for changes
def preview():
    """Preview the blog locally (not implemented yet)."""
    pass


def main():
    """CLI interface."""
    description = "Simple blog generator in Python with Pandoc as backend."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('action', choices=[
        'generate',
        'regenerate',
    ])
    args = parser.parse_args()

    # The number of failed builds doubles as the process exit status.
    if args.action == 'generate':
        sys.exit(generate(fresh=False))
    elif args.action == 'regenerate':
        sys.exit(generate(fresh=True))


if __name__ == '__main__':
    main()