a lot of work

Mainly generating feed and index.
author: Zhiming Wang <zmwangx@gmail.com> 2015-05-05 00:08:53 -0700
committer: Zhiming Wang <zmwangx@gmail.com> 2015-05-05 00:08:53 -0700
commit: d14e9ac5b86c911cb255ab30425790488c20fb4d (patch)
tree: 7a14faa9dd1e513850f33ced91d14729c3cb36e7
parent: 07bf43a314fe65ccd9c7cb663c3c6134a47cc269 (diff)
download: my_new_personal_website-d14e9ac5b86c911cb255ab30425790488c20fb4d.tar.xz
my_new_personal_website-d14e9ac5b86c911cb255ab30425790488c20fb4d.zip
4 files changed, 274 insertions, 26 deletions
diff --git a/pyblog b/pyblog
index df701ba1..55d08e71 100755
--- a/pyblog
+++ b/pyblog
@@ -1,21 +1,138 @@
 #!/usr/bin/env python3
 
+# TODO: timestamp to ISO
+
 """A simple blog generator with Pandoc as backend."""
 
 import argparse
+import datetime
+import io
 import os
 import re
 import shutil
 import subprocess
 import sys
+import tempfile
+import time
+import xml.etree.ElementTree as ET
+
+import bs4
+import dateutil.parser
+import dateutil.tz
 
 
 ROOTDIR = os.path.dirname(os.path.realpath(__file__))
 SOURCEDIR = os.path.join(ROOTDIR, "source")
-INDEX = os.path.join(SOURCEDIR, "index.md")
+INDEXMD = os.path.join(SOURCEDIR, "index.md")
 TEMPLATEDIR = os.path.join(ROOTDIR, "templates")
 HTMLTEMPLATE = os.path.join(TEMPLATEDIR, "template.html")
 BUILDDIR = os.path.join(ROOTDIR, "build")
+ATOM = os.path.join(BUILDDIR, "atom.xml")
+INDEXHTML = os.path.join(BUILDDIR, "index.html")
+
+FEED_MAX_ENTRIES = 20
+
+
+# Hack ET to support CDATA.
+# XML suuuuuucks.
+# http://stackoverflow.com/a/30019607/1944784
+
+def CDATA(text=None):  # pseudo-element that _serialize_xml writes out as <![CDATA[text]]>
+    element = ET.Element('![CDATA[')
+    element.text = text
+    return element
+
+ET._original_serialize_xml = ET._serialize_xml
+
+def _serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs):
+    """Write CDATA pseudo-elements verbatim; defer every other element to the stock serializer."""
+    if elem.tag == '![CDATA[':
+        write("\n<{}{}]]>\n".format(elem.tag, elem.text))
+        if elem.tail:
+            write(ET._escape_cdata(elem.tail))  # helper lives in ElementTree, not in this module
+    else:
+        return ET._original_serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs)
+
+ET._serialize_xml = ET._serialize['xml'] = _serialize_xml
+
+
+class AtomFeed(object):
+    """Class for storing atom:feed data and metadata."""
+
+    def __init__(self):
+        """Define available attributes."""
+        self.author = None  # atom:author
+        self.generator = None  # atom:generator, optional
+        self.icon = None  # atom:icon, optional
+        self.logo = None  # atom:logo, optional
+        self.id_text = None  # atom:id, just use URI
+        self.id = None  # atom:id
+        self.links = []  # list of atom:link
+        self.title = None  # atom:title
+        self.updated_datetime = None  # update time as a datetime object
+        self.updated = None  # atom:updated
+        self.entries = []  # list of atom:entry, in reverse time order
+        self.feed = None  # atom:feed, assembled
+
+    def assemble_feed(self):
+        """Assemble atom:feed from the already-populated child elements."""
+        self.feed = ET.Element("feed", xmlns="http://www.w3.org/2005/Atom")
+        self.feed.append(self.title)
+        for link in self.links:
+            self.feed.append(link)
+        self.feed.append(self.updated)
+        self.feed.append(self.id)
+        self.feed.append(self.author)
+        if self.icon is not None:
+            self.feed.append(self.icon)
+        if self.logo is not None:
+            self.feed.append(self.logo)
+        if self.generator is not None:
+            self.feed.append(self.generator)
+        # include at most FEED_MAX_ENTRIES entries in the feed
+        for entry in self.entries[:FEED_MAX_ENTRIES]:
+            self.feed.append(entry.entry)
+
+    def dump_feed(self):
+        """Return atom:feed as an XML string, assembling it first if needed."""
+        if self.feed is None:
+            self.assemble_feed()
+        return ET.tostring(self.feed).decode('utf-8')
+
+
+class AtomEntry(object):
+    """Class for storing atom:entry data and metadata."""
+
+    def __init__(self):
+        """Define available attributes; all start as None and are filled in by the caller."""
+        self.author = None  # atom:author
+        self.id_text = None  # atom:id, just use URI
+        self.id = None  # atom:id
+        self.relpath = None  # HTML page path relative to home
+        self.link = None  # atom:link
+        self.title_text = None  # plain text title
+        self.title = None  # atom:title
+        self.updated_datetime = None  # update time as a datetime object
+        self.updated = None  # atom:updated
+        self.content_html = None  # content as HTML markup
+        self.content = None  # atom:content
+        self.entry = None  # atom:entry, assembled
+
+    def assemble_entry(self):
+        """Assemble atom:entry from the already-populated child elements."""
+        self.entry = ET.Element("entry")
+        self.entry.append(self.title)
+        self.entry.append(self.link)
+        self.entry.append(self.updated)
+        self.entry.append(self.id)
+        self.entry.append(self.author)
+        self.entry.append(self.content)
+
+    def dump_entry(self):
+        """Return atom:entry as an XML string, assembling it first if needed."""
+        if self.entry is None:
+            self.assemble_entry()
+        return ET.tostring(self.entry).decode('utf-8')
 
 
 # TODO:
@@ -23,9 +140,118 @@ def new_post():
     pass
 
 
-# TODO:
-def generate_index():
-    pass
+def generate_index(feed):
+    """Generate index.html from index.md and a TOC; pandoc failures are reported on stderr."""
+
+    sys.stderr.write("generating index.html\n")
+
+    # generate TOC
+    tocbuff = io.StringIO()
+    tocbuff.write('<div class="indextoc" id="toc">')
+    year = 10000  # will be larger than the latest year for quite a while
+    # recall that entries are in reverse chronological order
+    for entry in feed.entries:
+        date = entry.updated_datetime
+        if date.year < year:
+            # write a new <h2 class="toc"> tag with the smaller year
+            year = date.year
+            tocbuff.write(u'\n<h2 class="toc" id="{0}" datetime="{0}">{0}</h2>\n\n'.format(year))
+
+        # write a new <li> entry (<ul>) in Markdown, in the format:
+        # * <time class="tocdate" datetime="2015-05-05T00:06:04-07:00">May 05</time>
+        #   [Blah blah](/blog/2015-05-04-blah-blah.html)
+        monthday = date.strftime("%B %d")
+        tocbuff.write(u'* <time class="tocdate" datetime="%s">%s</time> [%s](%s)\n' %
+                      (date.isoformat(), monthday, entry.title_text, entry.relpath))
+    tocbuff.write('</div>')
+
+    # create tempfile with index.md and the TOC concatenated, and generate index.html from that
+    fd, tmppath = tempfile.mkstemp()
+    os.close(fd)
+    with open(tmppath, 'w', encoding='utf-8') as tmpfile:
+        if os.path.exists(INDEXMD):
+            with open(INDEXMD, 'r', encoding='utf-8') as indexmd:
+                tmpfile.write(u"%s\n\n<hr>\n\n" % indexmd.read())
+        tmpfile.write("%s\n" % tocbuff.getvalue())
+        tocbuff.close()
+
+    pandoc_args = [
+        "pandoc", tmppath,
+        "--template", HTMLTEMPLATE,
+        "--highlight-style=pygments",
+        "-o", INDEXHTML,
+    ]
+    try:
+        subprocess.check_call(pandoc_args)
+    except subprocess.CalledProcessError:
+        sys.stderr.write("error: failed to generate index.html\n")
+    finally:
+        os.remove(tmppath)  # clean up even if pandoc is missing or fails
+
+
+def generate_index_and_feed():
+    """Generate index.html and atom feed."""
+    sys.stderr.write("generating atom feed\n")
+    # initialize feed
+    feed = AtomFeed()
+    # TODO: Put hard-coded values in a config file
+    feed.author = ET.fromstring('<author><name>Zhiming Wang</name><uri>https://github.com/zmwangx</uri><email>zmwangx@gmail.com</email></author>')
+    feed.generator = ET.Element("generator", uri="https://github.com/zmwangx/zmwangx.github.io")
+    feed.generator.text = "pyblog"
+    # TODO: feed.icon
+    feed.id_text = "http://zmwangx.github.io"
+    feed.id = ET.Element("id")
+    feed.id.text = feed.id_text
+    feed.links = [
+        ET.Element("link", href="http://zmwangx.github.io/atom.xml", rel="self"),
+        ET.Element("link", href="http://zmwangx.github.io/"),
+    ]
+    feed.title_text = "dl? cmplnts?"
+    feed.title = ET.fromstring("<title>%s</title>" % feed.title_text)
+    # update time will be set after everything finishes
+
+    postspath = os.path.join(BUILDDIR, "blog")
+    # traverse all posts in reverse time order
+    for name in sorted(os.listdir(postspath), reverse=True):
+        if re.match(r"^(\d{4})-(\d{2})-(\d{2}).*\.html", name):
+            htmlpath = os.path.join(postspath, name)
+            entry = AtomEntry()
+            with open(htmlpath, encoding="utf-8") as htmlfile:
+                soup = bs4.BeautifulSoup(htmlfile.read())  # NOTE(review): no parser named; bs4 picks one — consider pinning
+                entry.author = feed.author  # assume it's always the same author
+                entry.id_text = "%s/blog/%s" % (feed.id_text, name)
+                entry.id = ET.Element("id")
+                entry.id.text = entry.id_text
+                entry.relpath = "/blog/%s" % name
+                entry.link = ET.Element("link", href=entry.id_text)
+                entry.title_text = soup.title.text
+                entry.title = ET.Element("title", type="html")
+                entry.title.text = entry.title_text
+                post_date = soup.find("meta", attrs={"name": "date"})["content"]
+                entry.updated_datetime = dateutil.parser.parse(post_date)
+                entry.updated = ET.Element("updated")
+                entry.updated.text = entry.updated_datetime.isoformat()
+                # extract the article content without header and footer
+                article = soup.article
+                article.header.extract()
+                article.footer.extract()
+                entry.content_html = ''.join([str(content)
+                                              for content in article.contents])
+                entry.content = ET.Element("content", type="html")
+                entry.content.append(CDATA(entry.content_html))
+                entry.assemble_entry()
+                feed.entries.append(entry)
+
+    generate_index(feed)
+
+    feed.updated_datetime = datetime.datetime.fromtimestamp(round(time.time()),
+                                                            dateutil.tz.tzlocal())
+    feed.updated = ET.Element("updated")
+    feed.updated.text = feed.updated_datetime.isoformat()
+
+    with open(ATOM, 'w', encoding='utf-8') as atom:
+        atom.write("%s\n" % feed.dump_feed())
+        sys.stderr.write("wrote atom.xml\n")
 
 
 def generate(fresh=False):
@@ -67,6 +293,8 @@ def generate(fresh=False):
                 os.remove(obj)
 
     failed_builds = 0
+    template_mtime = os.path.getmtime(HTMLTEMPLATE)
+    anything_modified = False
 
     for root, _, files in os.walk(SOURCEDIR):
         relroot = os.path.relpath(root, start=SOURCEDIR)
@@ -78,7 +306,7 @@ def generate(fresh=False):
 
         for name in files:
             extension = name.split(".")[-1]
-            if extension not in ["css", "md"]:
+            if extension not in ["css", "jpg", "md", "png", "svg"]:
                 continue
 
             relpath = os.path.join(relroot, name)
@@ -88,8 +316,13 @@ def generate(fresh=False):
             else:
                 dstpath = os.path.join(dstroot, name)
             if ((not os.path.exists(dstpath) or
-                 os.path.getmtime(dstpath) <= os.path.getmtime(srcpath))):
-                if extension == "css":
+                 os.path.getmtime(dstpath) <=
+                 max(template_mtime, os.path.getmtime(srcpath)))):
+                # new post or modified post
+                anything_modified = True
+                if srcpath == INDEXMD:
+                    continue # index will be processed separately
+                if extension in ["css", "jpg", "png", "svg"]:
                     sys.stderr.write("copying %s\n" % relpath)
                     shutil.copy(srcpath, dstpath)
                 elif extension == "md":
@@ -106,6 +339,8 @@ def generate(fresh=False):
                         failed_builds += 1
                         sys.stderr.write("error: failed to generate %s" %
                                          relpath)
+    if anything_modified:
+        generate_index_and_feed()
 
     sys.stderr.write("build finished with %d errors\n" % failed_builds)
     return failed_builds
diff --git a/source/css/theme.css b/source/css/theme.css
index b06fe2a2..60308aac 100644
--- a/source/css/theme.css
+++ b/source/css/theme.css
@@ -34,6 +34,10 @@ h2.meta {
     font-style: italic;
 }
 
+h2.toc {
+    text-align: left;
+}
+
 h3 {
     font-size: 12pt;
 }
@@ -88,3 +92,13 @@ footer .cc-icon {
     background-size: 16px;
     vertical-align: middle;
 }
+
+div.indextoc ul {
+    list-style-type: none;
+    padding-left: 2em;
+}
+
+div.indextoc ul li time.tocdate {
+    float: left;
+    width: 8em;
+}
diff --git a/source/index.md b/source/index.md
index 0236fb77..b654ab88 100644
--- a/source/index.md
+++ b/source/index.md
@@ -5,6 +5,6 @@ title: dl? cmplnts?
 
 I am an undergrad at Stanford (junior as of May 2015) studying mathematics and theoretical physics. I enjoy coding in my spare time, for fun and profit (getting things done). I am lazy in general so I try to automate things as much as possible.
 
-My first programming language was Pascal and I consider C my mother tongue, but recently I write my code in Python or Bash (non-interactive) most of the time. My text editor is Emacs, and my interactive shell is Zsh. I use four-space indents. What else is there to tell?
+My first programming language was Pascal and I consider C my mother tongue, but recently I write my code in Python or Bash (non-interactive) most of the time. My operating system is OS X, my text editor is Emacs, my interactive shell is Zsh, my package manager is Homebrew, and my default browser (at the moment — it changes a lot) is Google Chrome. I use four-space indents. What else is there to tell?
 
-The blog could be about anything, but most of the stuff should be technical.
+This blog could be about anything, but most of the stuff should be technical.
diff --git a/templates/template.html b/templates/template.html
index 1b527e5e..ff6c5f5a 100644
--- a/templates/template.html
+++ b/templates/template.html
@@ -1,31 +1,30 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml"$if(lang)$ lang="$lang$" xml:lang="$lang$"$endif$>
+<!DOCTYPE html>
+<html>
 <head>
-  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  <meta http-equiv="Content-Style-Type" content="text/css" />
-  <meta name="generator" content="pandoc" />
-  <meta name="author" content="Zhiming Wang" />
+<meta charset="UTF-8">
+<meta name="generator" content="pandoc" />
+<meta name="author" content="Zhiming Wang" />
 $if(date)$
-  <meta name="date" content="$date$" />
+<meta name="date" content="$date$" />
 $endif$
-  <title>$pagetitle$ — dl? cmplnts?</title>
-  <style type="text/css">code{white-space: pre;}</style>
+<title>$pagetitle$</title>
+<style type="text/css">code{white-space: pre;}</style>
 $if(quotes)$
-  <style type="text/css">q { quotes: "“" "”" "‘" "’"; }</style>
+<style type="text/css">q { quotes: "“" "”" "‘" "’"; }</style>
 $endif$
 $if(highlighting-css)$
-  <style type="text/css">
+<style type="text/css">
 $highlighting-css$
-  </style>
+</style>
 $endif$
 $for(css)$
-  <link rel="stylesheet" href="$css$" $if(html5)$$else$type="text/css" $endif$/>
+<link rel="stylesheet" href="$css$" $if(html5)$$else$type="text/css" $endif$/>
 $endfor$
 $if(math)$
-  $math$
+$math$
 $endif$
 $for(header-includes)$
-  $header-includes$
+$header-includes$
 $endfor$
 <link href='/css/normalize.css' media="all" rel="stylesheet" type="text/css">
 <link href='/css/theme.css' media="all" rel="stylesheet" type="text/css">
@@ -41,7 +40,7 @@ $if(subtitle)$
 $endif$
 <h2 class="meta">
 $if(date-display)$
-<time class="timestamp" $if(date)$timestamp="$date$"$endif$>$date-display$,</time>
+<time class="timestamp" $if(date)$datetime="$date$"$endif$>$date-display$,</time>
 $endif$
 by <span class="author">Zhiming Wang</span>
 </h2>
@@ -53,8 +52,8 @@ $toc$
 </div>
 $endif$
 $body$
-<hr>
 <footer>
+<hr>
 <a class="cc-icon" href="https://creativecommons.org/licenses/by/4.0/" target="_blank" title="Released under the Creative Commons Attribution 4.0 International license.">
 <a href="https://github.com/zmwangx">Zhiming Wang</a>
 </footer>
author	Zhiming Wang <zmwangx@gmail.com>	2015-05-05 00:08:53 -0700
committer	Zhiming Wang <zmwangx@gmail.com>	2015-05-05 00:08:53 -0700
commit	d14e9ac5b86c911cb255ab30425790488c20fb4d (patch)
tree	7a14faa9dd1e513850f33ced91d14729c3cb36e7
parent	07bf43a314fe65ccd9c7cb663c3c6134a47cc269 (diff)
download	my_new_personal_website-d14e9ac5b86c911cb255ab30425790488c20fb4d.tar.xz my_new_personal_website-d14e9ac5b86c911cb255ab30425790488c20fb4d.zip