aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--generators/generators.py172
-rwxr-xr-xpyblog168
2 files changed, 173 insertions, 167 deletions
diff --git a/generators/generators.py b/generators/generators.py
index 9ad773c2..adfe819a 100644
--- a/generators/generators.py
+++ b/generators/generators.py
@@ -13,6 +13,13 @@ import subprocess
import lxml.etree as ET
+import urllib.parse
+import copy
+import email.utils
+import time
+
+from rss import *
+
from config.config import *
from utils import utils
@@ -517,3 +524,168 @@ def rewrite_title():
indexmd.write(re.sub(line, string, line))
else:
indexmd.write(line)
+
+
+def generate_index_and_feed():
+ """Generate index.html and feeds (atom and rss)."""
+ # pylint: disable=too-many-statements,attribute-defined-outside-init,invalid-name
+ sys.stderr.write("generating atom and rss feeds\n")
+ # initialize atom feed
+ feed = AtomFeed()
+ feed.author = ET.fromstring(
+ "<author>"
+ "<name>{author}</name>"
+ "<uri>{home}</uri>"
+ "<email>{email}</email>"
+ "</author>".format(author=AUTHOR, home=BLOG_HOME, email=AUTHOR_EMAIL))
+ feed.generator = ET.Element("generator", uri=GENERATOR_HOME_PAGE)
+ feed.generator.text = GENERATOR_NAME
+ if ATOM_ICON_PATH is not None:
+ feed.icon = ET.Element("icon")
+ feed.icon.text = urllib.parse.urljoin(BLOG_HOME, ATOM_ICON_PATH)
+ feed.id_text = BLOG_HOME
+ feed.id = ET.Element("id")
+ feed.id.text = feed.id_text
+ feed.links = [
+ ET.Element("link", href=urllib.parse.urljoin(BLOG_HOME, "atom.xml"), rel="self",
+ type="application/atom+xml"),
+ ET.Element("link", href=BLOG_HOME, rel="alternate",
+ type="text/html"),
+ ]
+ feed.title_text = BLOG_TITLE
+ feed.title = ET.fromstring("<title>{title}</title>".format(title=BLOG_TITLE))
+ feed.subtitle_text = BLOG_DESCRIPTION
+ feed.subtitle = ET.fromstring("<subtitle>{subtitle}</subtitle>"
+ .format(subtitle=BLOG_DESCRIPTION))
+ # initialize rss feed
+ rss = RssFeed()
+ rss.rssurl = urllib.parse.urljoin(BLOG_HOME, "rss.xml")
+ rss.title = ET.Element("title")
+ rss.title.text = BLOG_TITLE
+ rss.link = ET.Element("link")
+ rss.link.text = BLOG_HOME
+ rss.description = ET.Element("description")
+ rss.description.text = BLOG_DESCRIPTION
+ rss.language = ET.Element("language")
+ rss.language.text = LANGUAGE
+ rss.author_text = "{email} ({name})".format(email=AUTHOR_EMAIL, name=AUTHOR)
+ rss.managingEditor = ET.Element("managingEditor")
+ rss.managingEditor.text = rss.author_text
+ rss.webMaster = ET.Element("webMaster")
+ rss.webMaster.text = rss.author_text
+ rss.generator = ET.Element("generator")
+ rss.generator.text = "{generator} ({url})".format(generator=GENERATOR_NAME,
+ url=GENERATOR_HOME_PAGE)
+ rss.image = ET.Element("image")
+ if RSS_ICON_PATH is not None:
+ ET.SubElement(rss.image, "url").text = urllib.parse.urljoin(BLOG_HOME, RSS_ICON_PATH)
+ rss.image.append(copy.deepcopy(rss.title))
+ rss.image.append(copy.deepcopy(rss.link))
+ ET.SubElement(rss.image, "width").text = str(RSS_ICON_WIDTH)
+ ET.SubElement(rss.image, "height").text = str(RSS_ICON_HEIGHT)
+
+    # update times will be set after everything finishes
+
+ for name in os.listdir(os.path.join(BUILDDIR, "blog")):
+ if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}.*\.html", name):
+ htmlpath = os.path.join(BUILDDIR, "blog", name)
+ entry = AtomEntry()
+ item = RssItem()
+ try:
+ with open(htmlpath, encoding="utf-8") as htmlfile:
+ soup = bs4.BeautifulSoup(htmlfile.read(), "lxml")
+
+ # generate atom entry
+ entry.author = copy.deepcopy(feed.author) # assume it's always the same author
+ entry_url = urllib.parse.urljoin(BLOG_HOME, "blog/%s" % name)
+ entry.id_text = entry_url
+ entry.id = ET.Element("id")
+ entry.id.text = entry_url
+ entry.relpath = "/blog/%s" % name
+ entry.link = ET.Element("link", href=entry_url)
+ entry.title_text = soup.title.text
+ entry.title = ET.Element("title", type="html")
+ entry.title.text = entry.title_text
+ post_date = soup.find("meta", attrs={"name": "date"})["content"]
+ entry.updated_datetime = dateutil.parser.parse(post_date)
+ entry.updated = ET.Element("updated")
+ # pylint: disable=no-member
+ entry.updated.text = entry.updated_datetime.isoformat()
+
+ # process content
+ tags_to_remove = []
+ # mark header and footer for removal
+ article = soup.article
+ if article.header is not None:
+ tags_to_remove.append(article.header)
+ # mark line numbers for removal
+ for line_number_span in article.find_all("span",
+ attrs={"class": "line-number"}):
+ tags_to_remove.append(line_number_span)
+ # mark script tags for removal
+ for script_tag in article.find_all("script"):
+ tags_to_remove.append(script_tag)
+ # make internal links absolute
+ utils.absolutify_links(article, entry_url)
+ # remove marked tags
+ for tag in tags_to_remove:
+ tag.extract()
+
+ entry.content_html = ''.join([str(content)
+ for content in article.contents])
+ entry.content = ET.Element("content", type="html")
+ entry.content.text = ET.CDATA(entry.content_html)
+ entry.assemble_entry()
+ feed.entries.append(entry)
+
+ # generate rss item
+ item.title = ET.Element("title")
+ item.title.text = entry.title_text
+ item.link = ET.Element("link")
+ item.link.text = entry_url
+ item.description = ET.Element("description")
+ item.description.text = ET.CDATA(entry.content_html)
+ item.author = ET.Element("author")
+ item.author.text = rss.author_text
+ item.guid = ET.Element("guid", isPermaLink="true")
+ item.guid.text = item.link.text
+ item.timestamp = entry.updated_datetime.timestamp()
+ item.pubDate = ET.Element("pubDate")
+ item.pubDate.text = email.utils.formatdate(item.timestamp, usegmt=True)
+ item.assemble_item()
+ rss.items.append(item)
+ except Exception:
+ sys.stderr.write("error: failed to generate feed entry from %s\n" % name)
+ with open(htmlpath, encoding="utf-8") as htmlfile:
+ sys.stderr.write("dumping HTML:%s\n\n" % htmlfile.read())
+ raise
+ # sort entries by reverse chronological order
+ feed.entries.sort(key=lambda entry: entry.updated_datetime, reverse=True)
+ rss.items.sort(key=lambda item: item.timestamp, reverse=True)
+
+ generate_index(feed)
+ generate_menu()
+ generate_table()
+ generate_blog_list(feed)
+ generate_notes_list()
+ rewrite_title()
+
+ feed.updated_datetime = utils.current_datetime()
+ feed.updated = ET.Element("updated")
+ feed.updated.text = feed.updated_datetime.isoformat()
+
+ rss.update_timestamp = time.time()
+ rss.pubDate = ET.Element("pubDate")
+ rss.pubDate.text = email.utils.formatdate(rss.update_timestamp, usegmt=True)
+ rss.lastBuildDate = ET.Element("lastBuildDate")
+ rss.lastBuildDate.text = email.utils.formatdate(rss.update_timestamp, usegmt=True)
+
+ with open(ATOM, "w", encoding="utf-8") as atom:
+ atom.write("%s\n" % feed.dump_feed(FEED_MAX_ENTRIES))
+ sys.stderr.write("wrote atom.xml\n")
+
+ with open(RSS, "w", encoding="utf-8") as rssxml:
+ rssxml.write("%s\n" % rss.dump_rss(FEED_MAX_ENTRIES))
+ sys.stderr.write("wrote rss.xml\n")
+
+ generate_sitemap(feed)
diff --git a/pyblog b/pyblog
index 11ea868b..3d443482 100755
--- a/pyblog
+++ b/pyblog
@@ -51,172 +51,6 @@ from config.config import *
from generators import generators
-
-def generate_index_and_feed():
- """Generate index.html and feeds (atom and rss)."""
- # pylint: disable=too-many-statements,attribute-defined-outside-init,invalid-name
- sys.stderr.write("generating atom and rss feeds\n")
- # initialize atom feed
- feed = AtomFeed()
- feed.author = ET.fromstring(
- "<author>"
- "<name>{author}</name>"
- "<uri>{home}</uri>"
- "<email>{email}</email>"
- "</author>".format(author=AUTHOR, home=BLOG_HOME, email=AUTHOR_EMAIL))
- feed.generator = ET.Element("generator", uri=GENERATOR_HOME_PAGE)
- feed.generator.text = GENERATOR_NAME
- if ATOM_ICON_PATH is not None:
- feed.icon = ET.Element("icon")
- feed.icon.text = urllib.parse.urljoin(BLOG_HOME, ATOM_ICON_PATH)
- feed.id_text = BLOG_HOME
- feed.id = ET.Element("id")
- feed.id.text = feed.id_text
- feed.links = [
- ET.Element("link", href=urllib.parse.urljoin(BLOG_HOME, "atom.xml"), rel="self",
- type="application/atom+xml"),
- ET.Element("link", href=BLOG_HOME, rel="alternate",
- type="text/html"),
- ]
- feed.title_text = BLOG_TITLE
- feed.title = ET.fromstring("<title>{title}</title>".format(title=BLOG_TITLE))
- feed.subtitle_text = BLOG_DESCRIPTION
- feed.subtitle = ET.fromstring("<subtitle>{subtitle}</subtitle>"
- .format(subtitle=BLOG_DESCRIPTION))
- # initialize rss feed
- rss = RssFeed()
- rss.rssurl = urllib.parse.urljoin(BLOG_HOME, "rss.xml")
- rss.title = ET.Element("title")
- rss.title.text = BLOG_TITLE
- rss.link = ET.Element("link")
- rss.link.text = BLOG_HOME
- rss.description = ET.Element("description")
- rss.description.text = BLOG_DESCRIPTION
- rss.language = ET.Element("language")
- rss.language.text = LANGUAGE
- rss.author_text = "{email} ({name})".format(email=AUTHOR_EMAIL, name=AUTHOR)
- rss.managingEditor = ET.Element("managingEditor")
- rss.managingEditor.text = rss.author_text
- rss.webMaster = ET.Element("webMaster")
- rss.webMaster.text = rss.author_text
- rss.generator = ET.Element("generator")
- rss.generator.text = "{generator} ({url})".format(generator=GENERATOR_NAME,
- url=GENERATOR_HOME_PAGE)
- rss.image = ET.Element("image")
- if RSS_ICON_PATH is not None:
- ET.SubElement(rss.image, "url").text = urllib.parse.urljoin(BLOG_HOME, RSS_ICON_PATH)
- rss.image.append(copy.deepcopy(rss.title))
- rss.image.append(copy.deepcopy(rss.link))
- ET.SubElement(rss.image, "width").text = str(RSS_ICON_WIDTH)
- ET.SubElement(rss.image, "height").text = str(RSS_ICON_HEIGHT)
-
- # update times will be set after everthing finishes
-
- for name in os.listdir(os.path.join(BUILDDIR, "blog")):
- if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}.*\.html", name):
- htmlpath = os.path.join(BUILDDIR, "blog", name)
- entry = AtomEntry()
- item = RssItem()
- try:
- with open(htmlpath, encoding="utf-8") as htmlfile:
- soup = bs4.BeautifulSoup(htmlfile.read(), "lxml")
-
- # generate atom entry
- entry.author = copy.deepcopy(feed.author) # assume it's always the same author
- entry_url = urllib.parse.urljoin(BLOG_HOME, "blog/%s" % name)
- entry.id_text = entry_url
- entry.id = ET.Element("id")
- entry.id.text = entry_url
- entry.relpath = "/blog/%s" % name
- entry.link = ET.Element("link", href=entry_url)
- entry.title_text = soup.title.text
- entry.title = ET.Element("title", type="html")
- entry.title.text = entry.title_text
- post_date = soup.find("meta", attrs={"name": "date"})["content"]
- entry.updated_datetime = dateutil.parser.parse(post_date)
- entry.updated = ET.Element("updated")
- # pylint: disable=no-member
- entry.updated.text = entry.updated_datetime.isoformat()
-
- # process content
- tags_to_remove = []
- # mark header and footer for removal
- article = soup.article
- if article.header is not None:
- tags_to_remove.append(article.header)
- # mark line numbers for removal
- for line_number_span in article.find_all("span",
- attrs={"class": "line-number"}):
- tags_to_remove.append(line_number_span)
- # mark script tags for removal
- for script_tag in article.find_all("script"):
- tags_to_remove.append(script_tag)
- # make internal links absolute
- utils.absolutify_links(article, entry_url)
- # remove marked tags
- for tag in tags_to_remove:
- tag.extract()
-
- entry.content_html = ''.join([str(content)
- for content in article.contents])
- entry.content = ET.Element("content", type="html")
- entry.content.text = ET.CDATA(entry.content_html)
- entry.assemble_entry()
- feed.entries.append(entry)
-
- # generate rss item
- item.title = ET.Element("title")
- item.title.text = entry.title_text
- item.link = ET.Element("link")
- item.link.text = entry_url
- item.description = ET.Element("description")
- item.description.text = ET.CDATA(entry.content_html)
- item.author = ET.Element("author")
- item.author.text = rss.author_text
- item.guid = ET.Element("guid", isPermaLink="true")
- item.guid.text = item.link.text
- item.timestamp = entry.updated_datetime.timestamp()
- item.pubDate = ET.Element("pubDate")
- item.pubDate.text = email.utils.formatdate(item.timestamp, usegmt=True)
- item.assemble_item()
- rss.items.append(item)
- except Exception:
- sys.stderr.write("error: failed to generate feed entry from %s\n" % name)
- with open(htmlpath, encoding="utf-8") as htmlfile:
- sys.stderr.write("dumping HTML:%s\n\n" % htmlfile.read())
- raise
- # sort entries by reverse chronological order
- feed.entries.sort(key=lambda entry: entry.updated_datetime, reverse=True)
- rss.items.sort(key=lambda item: item.timestamp, reverse=True)
-
- generators.generate_index(feed)
- generators.generate_menu()
- generators.generate_table()
- generators.generate_blog_list(feed)
- generators.generate_notes_list()
- generators.rewrite_title()
-
- feed.updated_datetime = utils.current_datetime()
- feed.updated = ET.Element("updated")
- feed.updated.text = feed.updated_datetime.isoformat()
-
- rss.update_timestamp = time.time()
- rss.pubDate = ET.Element("pubDate")
- rss.pubDate.text = email.utils.formatdate(rss.update_timestamp, usegmt=True)
- rss.lastBuildDate = ET.Element("lastBuildDate")
- rss.lastBuildDate.text = email.utils.formatdate(rss.update_timestamp, usegmt=True)
-
- with open(ATOM, "w", encoding="utf-8") as atom:
- atom.write("%s\n" % feed.dump_feed(FEED_MAX_ENTRIES))
- sys.stderr.write("wrote atom.xml\n")
-
- with open(RSS, "w", encoding="utf-8") as rssxml:
- rssxml.write("%s\n" % rss.dump_rss(FEED_MAX_ENTRIES))
- sys.stderr.write("wrote rss.xml\n")
-
- generators.generate_sitemap(feed)
-
-
# exclude_list is only inialized once to avoid constant disk IO
@utils.static_vars(exclude_list=None)
def generate_blog(fresh=False, report_total_errors=True):
@@ -347,7 +181,7 @@ def generate_blog(fresh=False, report_total_errors=True):
utils.postprocess_html_file(dstpath)
if anything_modified:
- generate_index_and_feed()
+ generators.generate_index_and_feed()
sys.stderr.write("done\n")
if report_total_errors: