about | summary | refs | log | tree | commit | diff
path: root/pyblog
diff options
context:
space:
mode:
Diffstat (limited to 'pyblog')
-rwxr-xr-x pyblog 249
1 files changed, 242 insertions, 7 deletions
diff --git a/pyblog b/pyblog
index df701ba1..55d08e71 100755
--- a/pyblog
+++ b/pyblog
@@ -1,21 +1,138 @@
#!/usr/bin/env python3
+# TODO: timestamp to ISO
+
"""A simple blog generator with Pandoc as backend."""
import argparse
+import datetime
+import io
import os
import re
import shutil
import subprocess
import sys
+import tempfile
+import time
+import xml.etree.ElementTree as ET
+
+import bs4
+import dateutil.parser
+import dateutil.tz
ROOTDIR = os.path.dirname(os.path.realpath(__file__))
SOURCEDIR = os.path.join(ROOTDIR, "source")
-INDEX = os.path.join(SOURCEDIR, "index.md")
+INDEXMD = os.path.join(SOURCEDIR, "index.md")
TEMPLATEDIR = os.path.join(ROOTDIR, "templates")
HTMLTEMPLATE = os.path.join(TEMPLATEDIR, "template.html")
BUILDDIR = os.path.join(ROOTDIR, "build")
+ATOM = os.path.join(BUILDDIR, "atom.xml")
+INDEXHTML = os.path.join(BUILDDIR, "index.html")
+
+FEED_MAX_ENTRIES = 20
+
+
+# Hack ET to support CDATA.
+# XML suuuuuucks.
+# http://stackoverflow.com/a/30019607/1944784
+
+def CDATA(text=None):
+    """Return a pseudo-element that is serialized as a CDATA section.
+
+    The element is tagged with the sentinel ``![CDATA[`` so that the patched
+    ``_serialize_xml`` below can recognize it and write ``text`` verbatim
+    instead of XML-escaping it.
+    """
+    element = ET.Element('![CDATA[')
+    element.text = text
+    return element
+
+ET._original_serialize_xml = ET._serialize_xml
+
+def _serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs):
+    """Serialize ``elem``, emitting CDATA pseudo-elements unescaped.
+
+    Elements created by ``CDATA()`` are written as ``<![CDATA[...]]>``; every
+    other element is delegated to ElementTree's original serializer.
+    """
+    if elem.tag == '![CDATA[':
+        # A missing text becomes an empty section rather than the string
+        # "None".  A literal "]]>" would close the section early, so it is
+        # split across two adjacent CDATA sections.
+        text = (elem.text or "").replace("]]>", "]]]]><![CDATA[>")
+        write("\n<{}{}]]>\n".format(elem.tag, text))
+        if elem.tail:
+            # _escape_cdata lives in the ElementTree module, not in this one
+            write(ET._escape_cdata(elem.tail))
+    else:
+        return ET._original_serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs)
+
+# ElementTree looks the serializer up both by module-level name (for the
+# recursive calls) and through the _serialize table (for the top-level call).
+ET._serialize_xml = ET._serialize['xml'] = _serialize_xml
+
+
+class AtomFeed(object):
+    """Class for storing atom:feed data and metadata.
+
+    The ``*_text`` / ``*_datetime`` attributes hold plain Python values; the
+    remaining attributes hold the corresponding ElementTree elements, which
+    ``assemble_feed`` stitches together into ``self.feed``.
+    """
+
+    def __init__(self):
+        """Define available attributes."""
+        self.author = None  # atom:author
+        self.generator = None  # atom:generator, optional
+        self.icon = None  # atom:icon, optional
+        self.logo = None  # atom:logo, optional
+        self.id_text = None  # atom:id, just use URI
+        self.id = None  # atom:id
+        self.links = []  # list of atom:link
+        self.title_text = None  # plain text title
+        self.title = None  # atom:title
+        self.updated_datetime = None  # update time as a datetime object
+        self.updated = None  # atom:updated
+        self.entries = []  # list of atom:entry, in reverse time order
+        self.feed = None  # atom:feed, assembled
+
+    def assemble_feed(self):
+        """Assemble atom:feed from the individual elements.
+
+        title, updated, id and author must already be set; icon, logo and
+        generator are included only when set.  The result is stored in
+        ``self.feed``.
+        """
+        self.feed = ET.Element("feed", xmlns="http://www.w3.org/2005/Atom")
+        self.feed.append(self.title)
+        for link in self.links:
+            self.feed.append(link)
+        self.feed.append(self.updated)
+        self.feed.append(self.id)
+        self.feed.append(self.author)
+        if self.icon is not None:
+            self.feed.append(self.icon)
+        if self.logo is not None:
+            # append the logo itself, not a second copy of the icon
+            self.feed.append(self.logo)
+        if self.generator is not None:
+            self.feed.append(self.generator)
+        # include at most FEED_MAX_ENTRIES entries in the feed
+        for entry in self.entries[:FEED_MAX_ENTRIES]:
+            self.feed.append(entry.entry)
+
+    def dump_feed(self):
+        """Dump atom:feed XML as a string, assembling the feed if needed."""
+        if self.feed is None:
+            self.assemble_feed()
+        return ET.tostring(self.feed).decode('utf-8')
+
+
+class AtomEntry(object):
+    """Container for the pieces that make up a single atom:entry."""
+
+    def __init__(self):
+        """Start with every field unset; the caller fills them in."""
+        # plain Python values
+        self.id_text = None  # URI used as the atom:id text
+        self.relpath = None  # path of the HTML page relative to home
+        self.title_text = None  # title as plain text
+        self.updated_datetime = None  # datetime object of the last update
+        self.content_html = None  # HTML markup of the content
+        # ElementTree elements
+        self.author = None  # atom:author
+        self.id = None  # atom:id
+        self.link = None  # atom:link
+        self.title = None  # atom:title
+        self.updated = None  # atom:updated
+        self.content = None  # atom:content
+        self.entry = None  # assembled atom:entry
+
+    def assemble_entry(self):
+        """Build atom:entry from the child elements and store it."""
+        self.entry = node = ET.Element("entry")
+        # children go in this fixed order
+        children = (
+            self.title,
+            self.link,
+            self.updated,
+            self.id,
+            self.author,
+            self.content,
+        )
+        for child in children:
+            node.append(child)
+
+    def dump_entry(self):
+        """Return atom:entry as an XML string, assembling it first if needed."""
+        if self.entry is None:
+            self.assemble_entry()
+        raw = ET.tostring(self.entry)
+        return raw.decode('utf-8')
# TODO:
@@ -23,9 +140,118 @@ def new_post():
pass
-# TODO:
-def generate_index():
- pass
+def generate_index(feed):
+    """Generate index.html from index.md and a TOC.
+
+    The TOC lists every entry in ``feed.entries`` (expected in reverse
+    chronological order), grouped under one heading per year.  It is
+    appended to the contents of index.md (when that file exists), written to
+    a temporary file, and converted to INDEXHTML by pandoc.
+
+    Returns the number of failed builds: 0 on success, 1 if pandoc exited
+    with a non-zero status.
+    """
+
+    sys.stderr.write("generating index.html\n")
+
+    # generate TOC
+    tocbuff = io.StringIO()
+    tocbuff.write('<div class="indextoc" id="toc">')
+    year = 10000  # will be larger than the latest year for quite a while
+    # recall that entries are in reverse chronological order
+    for entry in feed.entries:
+        date = entry.updated_datetime
+        if date.year < year:
+            # write a new <h2 class="toc"> tag with the smaller year
+            year = date.year
+            tocbuff.write(u'\n<h2 class="toc" id="{0}" datetime="{0}">{0}</h2>\n\n'.format(year))
+
+        # write a new <li> entry (<ul>) in Markdown, in the format:
+        #   * <time class="tocdate" datetime="2015-05-05T00:06:04-0700">May 05</time>
+        #     [Blah blah](/blog/2015-05-04-blah-blah.html)
+        monthday = date.strftime("%B %d")
+        tocbuff.write(u'* <time class="tocdate" datetime="%s">%s</time> [%s](%s)\n' %
+                      (date.isoformat(), monthday, entry.title_text, entry.relpath))
+    tocbuff.write('</div>')
+    toc = tocbuff.getvalue()
+    tocbuff.close()
+
+    failures = 0
+    # create tempfile with index.md and the TOC concatenated, and generate index.html from that
+    fd, tmppath = tempfile.mkstemp()
+    os.close(fd)
+    try:
+        with open(tmppath, 'w', encoding='utf-8') as tmpfile:
+            if os.path.exists(INDEXMD):
+                with open(INDEXMD, 'r', encoding='utf-8') as indexmd:
+                    tmpfile.write(u"%s\n\n<hr>\n\n" % indexmd.read())
+            tmpfile.write("%s\n" % toc)
+
+        pandoc_args = [
+            "pandoc", tmppath,
+            "--template", HTMLTEMPLATE,
+            "--highlight-style=pygments",
+            "-o", INDEXHTML,
+        ]
+        try:
+            subprocess.check_call(pandoc_args)
+        except subprocess.CalledProcessError:
+            # The caller's failure counter is not visible from here, so the
+            # failure is counted locally and reported via the return value.
+            failures += 1
+            sys.stderr.write("error: failed to generate index.html\n")
+    finally:
+        # remove the temporary file even if writing it or running pandoc raised
+        os.remove(tmppath)
+    return failures
+
+
+def generate_index_and_feed():
+    """Generate index.html and atom feed.
+
+    Every generated post under BUILDDIR/blog whose file name starts with a
+    YYYY-MM-DD date and ends in ".html" is parsed into an AtomEntry.  The
+    entries are used to build the index page (via generate_index), and the
+    assembled feed is written to ATOM.
+    """
+    sys.stderr.write("generating atom feed\n")
+    # initialize feed
+    feed = AtomFeed()
+    # TODO: Put hard-coded values in a config file
+    feed.author = ET.fromstring('<author><name>Zhiming Wang</name><uri>https://github.com/zmwangx</uri><email>zmwangx@gmail.com</email></author>')
+    feed.generator = ET.Element("generator", uri="https://github.com/zmwangx/zmwangx.github.io")
+    feed.generator.text = "pyblog"
+    # TODO: feed.icon
+    feed.id_text = "http://zmwangx.github.io"
+    feed.id = ET.Element("id")
+    feed.id.text = feed.id_text
+    feed.links = [
+        ET.Element("link", href="http://zmwangx.github.io/atom.xml", rel="self"),
+        ET.Element("link", href="http://zmwangx.github.io/"),
+    ]
+    feed.title_text = "dl? cmplnts?"
+    # set the text on an element instead of interpolating it into an XML
+    # string, so that "&" or "<" in the title cannot break parsing
+    feed.title = ET.Element("title")
+    feed.title.text = feed.title_text
+    # update time will be set after everything finishes
+
+    postspath = os.path.join(BUILDDIR, "blog")
+    # anchored at both ends so that e.g. "2015-05-04-foo.html.bak" is skipped
+    post_name_re = re.compile(r"^(\d{4})-(\d{2})-(\d{2}).*\.html$")
+    # traverse all posts in reverse time order
+    for name in sorted(os.listdir(postspath), reverse=True):
+        if not post_name_re.match(name):
+            continue
+        htmlpath = os.path.join(postspath, name)
+        entry = AtomEntry()
+        with open(htmlpath, encoding="utf-8") as htmlfile:
+            # name the parser explicitly so the result does not depend on
+            # which optional parsers happen to be installed
+            soup = bs4.BeautifulSoup(htmlfile.read(), "html.parser")
+        entry.author = feed.author  # assume it's always the same author
+        entry.id_text = "%s/blog/%s" % (feed.id_text, name)
+        entry.id = ET.Element("id")
+        entry.id.text = entry.id_text
+        entry.relpath = "/blog/%s" % name
+        entry.link = ET.Element("link", href=entry.id_text)
+        entry.title_text = soup.title.text
+        # NOTE(review): the title is plain text but is declared type="html";
+        # confirm whether type="text" was intended
+        entry.title = ET.Element("title", type="html")
+        entry.title.text = entry.title_text
+        post_date = soup.find("meta", attrs={"name": "date"})["content"]
+        entry.updated_datetime = dateutil.parser.parse(post_date)
+        entry.updated = ET.Element("updated")
+        entry.updated.text = entry.updated_datetime.isoformat()
+        # extract the article content without header and footer
+        article = soup.article
+        for wrapper in (article.header, article.footer):
+            # a post without a header or footer simply has nothing to strip
+            if wrapper is not None:
+                wrapper.extract()
+        entry.content_html = ''.join(str(content)
+                                     for content in article.contents)
+        entry.content = ET.Element("content", type="html")
+        entry.content.append(CDATA(entry.content_html))
+        entry.assemble_entry()
+        feed.entries.append(entry)
+
+    generate_index(feed)
+
+    feed.updated_datetime = datetime.datetime.fromtimestamp(round(time.time()),
+                                                            dateutil.tz.tzlocal())
+    feed.updated = ET.Element("updated")
+    feed.updated.text = feed.updated_datetime.isoformat()
+
+    with open(ATOM, 'w', encoding='utf-8') as atom:
+        atom.write("%s\n" % feed.dump_feed())
+        sys.stderr.write("wrote atom.xml\n")
def generate(fresh=False):
@@ -67,6 +293,8 @@ def generate(fresh=False):
os.remove(obj)
failed_builds = 0
+ template_mtime = os.path.getmtime(HTMLTEMPLATE)
+ anything_modified = False
for root, _, files in os.walk(SOURCEDIR):
relroot = os.path.relpath(root, start=SOURCEDIR)
@@ -78,7 +306,7 @@ def generate(fresh=False):
for name in files:
extension = name.split(".")[-1]
- if extension not in ["css", "md"]:
+ if extension not in ["css", "jpg", "md", "png", "svg"]:
continue
relpath = os.path.join(relroot, name)
@@ -88,8 +316,13 @@ def generate(fresh=False):
else:
dstpath = os.path.join(dstroot, name)
if ((not os.path.exists(dstpath) or
- os.path.getmtime(dstpath) <= os.path.getmtime(srcpath))):
- if extension == "css":
+ os.path.getmtime(dstpath) <=
+ max(template_mtime, os.path.getmtime(srcpath)))):
+ # new post or modified post
+ anything_modified = True
+ if srcpath == INDEXMD:
+ continue # index will be processed separately
+ if extension in ["css", "jpg", "png", "svg"]:
sys.stderr.write("copying %s\n" % relpath)
shutil.copy(srcpath, dstpath)
elif extension == "md":
@@ -106,6 +339,8 @@ def generate(fresh=False):
failed_builds += 1
sys.stderr.write("error: failed to generate %s" %
relpath)
+ if anything_modified:
+ generate_index_and_feed()
sys.stderr.write("build finished with %d errors\n" % failed_builds)
return failed_builds