# BlogManager
# BlogPublisher
# TikzManager
# ImageManager
from titlecase import titlecase
import pandas as pd
from datetime import datetime
from itertools import count
from bs4 import BeautifulSoup
import argparse
from pathlib3x import Path
from subprocess import Popen, PIPE
import hashlib
import logging
import re
import numpy as np
logger = logging.getLogger(__name__)
try:
from pdf2image import convert_from_path
has_convert_from_path = True
except ModuleNotFoundError:
# logger.warning('No pdf2image...cannot convert PDF files to png.')
has_convert_from_path = False
# blog website files
BLOG_PATH = Path.home() / 'new_mynl/blog'
# folder for created images (keep separate)
STATIC_IMAGE_PATH = (BLOG_PATH / '../static/blog_img').resolve()
# this is not printing??
# poor mans
# logger = type('asd', (), {})
# logger.info = print
# logger.warning = print
# logger.debug = print
# logger.error = print
class BlogManager(object):
    """
    BlogManager is used by the flask_app to do all the content management for the blog.
    Flask app handles the actual rendering, but all reports and extracts come from BlogManager.

    Posts live as HTML files named ``title---YYYY-MM-DD-tag1-tag2.html`` in ``post_dir``;
    tag tables, menus and reports are all derived from that directory.
    """

    def __init__(self, post_dir=''):
        """
        :param post_dir: directory containing the published post HTML files;
            '' (default) uses ``BLOG_PATH``. A str is converted to a ``Path``.
        """
        if post_dir == '':
            self.post_dir = BLOG_PATH
        elif isinstance(post_dir, Path) is False:
            self.post_dir = Path(post_dir)
        else:
            self.post_dir = post_dir
        # lazily-built caches behind the blog_df / tag_to_post_df / top_tags properties
        self._blog_df = None
        self._tag_to_post_df = None
        self._top_tags = None

    def refresh(self):
        """
        Force cache update.
        """
        self._blog_df = None
        self._tag_to_post_df = None
        # BUG FIX: _top_tags is derived from tag_to_post_df and was not being
        # invalidated, so refresh() left a stale tag ranking behind.
        self._top_tags = None

    @property
    def blog_df(self):
        # one row per post, indexed by the md5 hash of the post file stem
        if self._blog_df is None:
            self._blog_df = self.blog_entries_to_df(self.post_dir)
        return self._blog_df

    @property
    def tag_to_post_df(self):
        # long-format (tag, post hash) table: one row per tag occurrence
        if self._tag_to_post_df is None:
            self._tag_to_post_df = pd.DataFrame(columns=['tag', 'hash'])
            i = count()
            for n, x in self.blog_df.iterrows():
                for t in x.tag_list:
                    self._tag_to_post_df.loc[next(i)] = [t, n]
        return self._tag_to_post_df

    @property
    def top_tags(self):
        # tags ranked by post count (descending), alphabetical tie-break
        if self._top_tags is None:
            self._top_tags = self.tag_to_post_df.groupby('tag').count(). \
                reset_index(drop=False).sort_values(['hash', 'tag'],
                                                    ascending=[False, True]). \
                set_index('tag')
        return self._top_tags

    @property
    def top_tag_menu(self):
        """
        Menu items for top [5] tags.

        :return: (nav items html for the top five tags,
                  dropdown items html for all remaining tags)
        """
        ans = []
        dd_ans = []
        for n, r in self.top_tags.head(5).iterrows():
            ans.append(
                f'<li class="nav-item"> <a class="nav-link" href="/blog/{n}">{n}</a></li>'
            )
        for n, r in self.top_tags.iloc[5:].iterrows():
            dd_ans.append(
                f'<a class="dropdown-item" href="/blog/{n}">{n} ({r["hash"]})</a>'
            )
        return '\n'.join(ans), '\n'.join(dd_ans)

    def search_tag(self, regex):
        """
        Search tags (list of individual tags) using a regex.
        Only finds one tag type.
        """
        # tags --> hashes
        idx = self.tag_to_post_df['tag'].str.match(regex, flags=re.IGNORECASE)
        # get post hash_idx, note there may be duplicates
        hash_idx = set(self.tag_to_post_df.loc[idx].hash)
        return self.blog_df.loc[hash_idx]

    def search_regex(self, regex, field):
        """
        Search through ``field`` using a regex and return relevant posts.
        ``field`` must be a string column of ``blog_df``:
        tags, title, html, words, post_date, ... (NOT tag_list).
        Regex runs a contains query: you are responsible for start/finish anchors.
        """
        # boolean mask over blog_df rows
        hash_idx = self.blog_df[field].str.contains(regex, flags=re.IGNORECASE)
        return self.blog_df.loc[hash_idx]

    def search_query(self, query):
        """
        Send well formed query to blog_df (pandas ``DataFrame.query`` syntax).
        """
        return self.blog_df.query(query)

    def _search(self, query):
        """
        Implements ! (word search) and `` and `` (tag intersection) searching.

        :param query: ``!regex`` searches words; ``a and b`` intersects tag
            matches; anything else is a plain tag match.
        :return: (matching_posts DataFrame, human readable list label)
        """
        if query[0] == '!':
            matching_posts = self.search_regex(query[1:], 'words')
            list_label = f'Recent posts with words matching "{query[1:]}"'
        elif query.find(' and ') >= 0:
            # intersect the post sets matching each individual tag
            queries = query.split(" and ")
            idx = []
            for r in queries:
                match = self.search_tag(r)
                idx.append(match.index)
            ans = set(idx[0])
            for j in idx[1:]:
                ans = ans.intersection(j)
                if len(ans) == 0:
                    break
            # this works even if ans is empty
            matching_posts = self.blog_df.loc[ans]
            list_label = f'Recent posts tagged {query}'
        else:
            matching_posts = self.search_tag(query)
            list_label = f'Recent posts with tags matching "{query}"'
        return matching_posts, list_label

    @property
    def tags(self):
        """
        Returns an iterable of distinct tags.
        """
        return self.tag_to_post_df['tag'].unique()

    def make_card_list(self, tag):
        """
        Return the most recent matching posts grouped as two rows of three card
        dicts (title, text, hash, tags, posted), to be rendered in snapshot cards.
        """
        matching_posts, list_label = self._search(tag)
        row_iter = matching_posts.sort_values(
            'modify_date', ascending=False).head(12).iterrows()
        card_list = []
        try:
            for r in range(2):
                cards = []
                card_list.append(cards)
                for i in range(3):
                    h, x = next(row_iter)
                    short_tags = self.tags_to_tag_links(x['tags'], glue=', ')
                    c1 = dict(title=x['title'], text=x['summary'],
                              hash=h, tags=short_tags, posted=x['post_date'])
                    cards.append(c1)
        except StopIteration:
            # fewer than six matches: bomb out when you get to the end...
            pass
        return card_list

    def list_of_posts(self, regex):
        """
        Return list of matching blog files formatted as a HTML list. Three modes:

        1. regex is !str: run as a regex query against all words
        2. regex contains " and ", split and run intersection of type 3
        3. regex run as match against tag list.
        """
        matching_posts, list_label = self._search(regex)
        posts = []
        for n, r in matching_posts.sort_values(['modify_date', 'title'], ascending=(False, True)).iterrows():
            posts.append(f'<li> <a href="/blog?id={n}" class="text-light">{r["post_date"]} | '
                         f'{r["title"]}</a>')
        posts = '\n'.join(posts)
        return posts, list_label

    def report(self, kind):
        """
        Create a report about the posts; returns html.
        kind = title, date, modify, tag, statistics
        """
        # display names for the raw column labels
        renamer = {'post_date': 'Post date', 'title': 'Title', 'size': 'Size', 'tag': 'Tag',
                   'tags': 'Tags', 'avg_date': 'Avg date', 'number': 'Number', 'avg_size': 'Avg size',
                   'max_size': 'Max size', 'cross_tags': 'Num tags', 'modify_date': 'Modify date'}
        if kind == 'title':
            bit = self.blog_df[['post_date',
                                'modify_date', 'title', 'size', 'tags']]
            bit = bit.sort_values(['title', 'post_date'],
                                  ascending=[True, False])
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['title', 'post_date', 'size', 'tags']].rename(
                columns=renamer)
            # NOTE(review): Styler.hide_index()/render() were removed in pandas 2.0
            # (use .hide()/.to_html()); kept for compatibility with the pandas in use
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()
        elif kind == 'date':
            bit = self.blog_df[['post_date', 'title', 'tags']]
            bit = bit.sort_values(['post_date', 'title'],
                                  ascending=[False, True])
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['post_date', 'title', 'tags']].rename(columns=renamer)
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()
        elif kind == 'modify':
            bit = self.blog_df[['modify_date', 'title', 'tags']]
            bit = bit.sort_values(
                ['modify_date', 'title'], ascending=[False, True])
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['modify_date', 'title', 'tags']].rename(columns=renamer)
            s = bit.style.format(
                formatter={'Modify date': "{:%Y-%m-%d}"}).hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()
        elif kind == 'tag':
            bit = self.tag_to_post_df.merge(self.blog_df[['post_date', 'title']],
                                            left_on='hash', right_index=True, how='left')
            bit = bit.sort_values(['tag', 'title'])
            bit.title = [
                f'<a href="/blog?id={h}">{i}</a>' for _, (h, i) in bit[['hash', 'title']].iterrows()]
            bit.tag = [f'<a href="/blog/{t}">{t}</a>' for t in bit['tag']]
            bit = bit[['tag', 'title', 'post_date']].rename(columns=renamer)
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()
        elif kind == 'statistics':
            bit = self.tag_to_post_df.merge(
                self.blog_df[['post_date', 'modify_date',
                              'size', 'title', 'summary', 'tag_list']],
                left_on='hash', right_index=True, how='left')
            # per-tag post count, size and date statistics
            gb = bit.groupby('tag').agg(number=('hash', np.size),
                                        avg_size=('size', np.mean),
                                        max_size=('size', np.max),
                                        avg_date=('modify_date', np.mean),
                                        cross_tags=('tag_list', lambda x: np.mean(
                                            [len(i) for i in x]))
                                        ).sort_values('number', ascending=False)
            gb['tag'] = [f'<a href="/blog/{i}">{i}</a>' for i in gb.index]
            gb = gb[['tag', 'avg_date', 'number', 'avg_size', 'max_size', 'cross_tags']]. \
                sort_values(['number', 'avg_date'], ascending=[False, True]). \
                rename(columns=renamer)
            # see https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
            s = gb.style.format(formatter={'Avg date': "{:%Y-%m-%d}", 'Avg size': '{:,.0f}',
                                           'Max size': '{:,.0f}', 'Num tags': '{:.1f}'}).hide_index()
            s = s.set_table_styles(
                [{'selector': 'td', 'props': [('text-align', 'right')]}])
            return s.render()
        else:
            return '<h2>Unknown report type</h2>'

    @staticmethod
    def blog_entries_to_df(p):
        """
        Convert blog entries in directory p (a Path) to a dataframe.
        Parse tags, title, etc.

        * Must start with an h1; generally that is the only h1 in the document
        * A final comment with a list of tags; if there is no tag it is tagged NOC
        * A span, usually near the top, with class description that becomes the og
          summary and the card summary. If missing then the first 150 or so
          (to a word break) are used.
        * An image with class og_image that becomes the og image (not too large!)
        """
        assert p.exists()
        posts = list(p.glob('*.html'))
        ans = []
        for post in posts:
            try:
                # stem is title---YYYY-MM-DD-tag1-tag2
                _, title, date, tags, _ = re.split(
                    r'(.*)\-\-\-(....\-..\-..)\-(.*)', post.stem)
                title = titlecase(title.replace('-', ' '))
                tags = tags.replace('-', ', ').replace('_', ' ')
                txt = post.read_text(encoding='utf-8')
                stats = post.stat()
                access, modify, create = map(lambda x: pd.to_datetime(x, unit='s', utc=True).tz_convert('US/Eastern'),
                                             [stats.st_atime, stats.st_mtime, stats.st_ctime])
                soup = BeautifulSoup(txt, "html.parser")
                # figure the description
                temp = soup('span', class_='description')
                if len(temp) > 0:
                    summary = str(temp[0])
                else:
                    # no canned description: build one from the leading elements
                    # NOTE(review): temp2 counts words while extract_length is a
                    # character budget elsewhere — looks inconsistent; confirm
                    extract_length = 35
                    n = 0
                    bit = []
                    for e in soup(['h2', 'h3', 'p', 'ul', 'ol', 'div', 'span']):
                        # need to get something...
                        if n == 0:
                            if len(e.text) < extract_length:
                                bit.append(e.text)
                            else:
                                temp1 = []
                                temp2 = 0
                                for w in e.text.split():
                                    if temp2 + len(w) < extract_length:
                                        temp1.append(w)
                                        temp2 += 1
                                    else:
                                        break
                                bit.append(' '.join(temp1))
                                break
                        elif n + len(e.text) < extract_length:
                            bit.append(e.text)
                            n += len(e.text.split())
                        else:
                            break
                    summary = '\n'.join(bit)
                # figure an og_image: prefer the tagged one, else the first image
                temp = soup('img', id='og_image')
                if len(temp) > 0:
                    og_image = temp[0]['src']
                else:
                    temp = soup('img')
                    if len(temp) > 0:
                        og_image = temp[0]['src']
                    else:
                        og_image = ''
                # hash based on the FILE NAME rather than the content...
                # allows content changes while the entry keeps a fixed identity
                h = hashlib.md5(post.stem.encode('utf-8')).digest().hex()
                ans.append([date, access, modify, create, stats.st_size, tags, tags.split(', '),
                            title, txt, summary, og_image, str(post), h])
            except ValueError:
                # file name did not parse into title---date-tags
                logger.error(f'{post}')
        df = pd.DataFrame(ans, columns=['post_date', 'access_date', 'modify_date', 'create_date', 'size',
                                        'tags', 'tag_list', 'title', 'html', 'summary', 'og_image', 'path', 'hash'])
        # strip comments and markup to leave searchable words
        # BUG FIX: regex=True makes the pattern explicit; pandas >= 2 defaults
        # str.replace to literal mode, which would silently break this strip
        df['words'] = df.html.str.replace(
            pat=r'(?=<!--)([\s\S]*?)-->|<.*?>', repl='', flags=re.MULTILINE, regex=True)
        df = df.set_index('hash')
        # remove duplicates (TODO: this shouldn't happen!) keep the oldest version
        # of each post...remember the text is the same
        df = df.sort_values(['modify_date', 'title'], ascending=[False, True])
        dups = df.index.duplicated(keep='last')
        df = df.loc[~dups].copy()
        return df

    @staticmethod
    def name_to_parts(fn):
        """
        Split a post Path into (tags, title); the stem layout is
        ``YYYY-MM-DD-tags---title``.
        """
        fn = fn.stem
        tags, title = re.split(r'(.*?)\-\-\-(.*)', fn[11:])[1:-1]
        tags = tags.split('-')
        title = titlecase(title.replace('-', ' '))
        return tags, title

    @staticmethod
    def tags_to_tag_links(tags, glue):
        """
        Convert csv list of tags into links.
        glue = ', ' or '\n' are common.
        """
        ans = []
        for t in tags.split(','):
            t = t.strip()
            # BUG FIX: removed stray doubled quote after the href attribute,
            # which produced invalid HTML (href="/blog/x"")
            s = f'<a href="/blog/{t}">{t}</a>'
            ans.append(s)
        return glue.join(ans)
class PublisherBase(object):
    """
    Container for some static functions.
    Handles workflow tracking.
    """

    # standard TeX macro definitions expanded by process_tex_macros;
    # parsed by tex_to_dict, one \def per line
    _macros = r"""\def\E{\mathsf{E}}
\def\Var{\mathsf{Var}}
\def\var{\mathsf{var}}
\def\SD{\mathsf{SD}}
\def\VaR{\mathsf{VaR}}
\def\CTE{\mathsf{CTE}}
\def\WCE{\mathsf{WCE}}
\def\AVaR{\mathsf{AVaR}}
\def\CVaR{\mathsf{CVaR}}
\def\TVaR{\mathsf{TVaR}}
\def\biTVaR{\mathsf{biTVaR}}
\def\ES{\mathsf{ES}}
\def\EPD{\mathsf{EPD}}
\def\cov{\mathsf{cov}}
\def\corr{\mathsf{Corr}}
\def\Pr{\mathsf{Pr}}
\def\ecirc{\accentset{\circ} e}
\def\dsum{\displaystyle\sum}
\def\dint{\displaystyle\int}
\def\AA{\mathcal{A}}
\def\bb{\bm{b}}
\def\ww{\bm{w}}
\def\xx{\bm{x}}
\def\yy{\bm{y}}
\def\HH{\bm{H}}
\def\FFF{\mathscr{F}}
\def\FF{\mathcal{F}}
\def\MM{\mathcal{M}}
\def\OO{\mathscr{O}}
\def\PPP{\mathscr{P}}
\def\PP{\mathsf{P}}
\def\QQ{\mathsf{Q}}
\def\RR{\mathbb{R}}
\def\ZZ{\mathbb{Z}}
\def\NN{\mathbb{N}}
\def\XXX{\mathcal{X}}
\def\XX{\bm{X}}
\def\ZZZ{\mathcal{Z}}
\def\bbeta{\bm{\beta}}
\def\cp{\mathsf{CP}}
\def\atan{\mathrm{atan}}
\def\ecirc{\accentset{\circ} e}
\def\tpx{{{}_tp_x}}
\def\kpx{{{}_kp_x}}
\def\tpy{{{}_tp_y}}
\def\tpxy{{{}_tp_{xy}}}
\def\tpxybar{{{}_tp_{\overline{xy}}}}
\def\tqx{{{}_tq_x}}"""

    def __init__(self):
        # path for posts
        self.web_path = BLOG_PATH
        # path for images
        self.static_img_path = STATIC_IMAGE_PATH
        # log workflow (TODO: sad that logged items are detached from where they are called)
        self._workflow = []
        self._n = 0

    def web_link(self, web_file):
        """
        Create a link of web_file relative to static_img_path.
        """
        return '/' + (web_file.resolve().relative_to((self.web_path / '..').resolve())).as_posix()

    def workflow_reset(self):
        """Clear the workflow log and its counter."""
        self._workflow = []
        self._n = 0

    def workflow(self, msg):
        """
        Add a message to the workflow.

        :param msg: text to log and record
        :return:
        """
        logger.info(msg)
        self._n += 1
        self._workflow.append(f'({self._n:02d}) {msg}')

    def workflow_show(self):
        """Print the workflow log to stdout."""
        print('\n'.join(self._workflow))

    def workflow_get(self):
        """
        Return the workflow object as an HTML comment.
        """
        nl = '\n'
        return f"\n\n<!--\n{nl.join(self._workflow)}\n-->\n"

    def workflow_raw(self):
        """Return the raw workflow list."""
        return self._workflow

    def process_includes(self, *, txt='', fn=None):
        """
        Stand-alone process includes. txt = current status of buffer. fn = Path object source.
        If txt is None or '' then txt is read from fn. This allows it to be used stand-alone.
        Not static because calls functions that access the workflow. But can be part of the base.

        :param txt: current buffer, or ''/None to read from fn
        :param fn: Path to the source file
        :return: txt with includes resolved.
        """
        # BUG FIX: the docstring promised txt=None would read from fn, but only
        # '' was handled; accept both
        if txt == '' or txt is None:
            txt = fn.read_text(encoding='utf-8')
        if txt.find('@@@') < 0:
            return txt
        # else have work to do
        if fn is None:
            fn = Path('.')
        base_dir = fn.parent.resolve()
        n_includes = 0
        # first, substitute for all NNN specs (keep this for backwards compatibility)
        # BUG FIX: Path objects are not subscriptable; key on the file NAME prefix
        # NOTE(review): globbing base_dir.parent (not base_dir) — confirm intended
        file_map = {i.name[0:3]: i for i in base_dir.parent.glob("*.md")}
        txt, n_includes = self._process_includes(
            txt, base_dir, n_includes, file_map)
        self.workflow(f'IMPORT: {n_includes} files imported')
        return txt

    def _process_includes(self, txt, base_dir, n_includes, file_map):
        """
        Process @@@ include elements.
        From markdown_make.py without color_includes logic.
        Iterative processing of include files.
        file_map looks for nnn_something.md files in the current directory.
        base_dir = directory name.
        """
        includes = re.findall(
            r'@@@include ([\./]*)([0-9]{3}|[0-9A-Za-z])([^\n]+\.[a-z]+)?', txt)
        for res_ in includes:
            original_match = ''.join(res_)
            # res_[1] looks for nnn type files and tries to find them in file_map
            if res_[2] == '':
                res = file_map[res_[1]]
            else:
                res = original_match
            self.workflow(f'IMPORT: Importing {res}')
            n_includes += 1
            try:
                repl = (base_dir / res).read_text(encoding='utf-8')
                repl = self._strip_yaml(repl)
                # recurse: included files may themselves contain includes
                repl, n_includes = self._process_includes(
                    repl, base_dir, n_includes, file_map)
                txt = txt.replace(f'@@@include {original_match}', repl)
            except FileNotFoundError:
                self.workflow(
                    f'IMPORT: WARNING @@@ included file {res} not found...ignoring')
        return txt, n_includes

    def _strip_yaml(self, text):
        """
        Strip starting yaml, between first --- and next ---, from text.
        Applies to included files.
        From markdown_make.py.

        :param text: file contents
        :return: text without a leading YAML block
        """
        if text[:3] != '---':
            return text
        else:
            self.workflow('Stripped YAML')
            stext = text.split('\n')
            stext.pop(0)
            n = 0
            for ln in stext:
                if ln != '---':
                    n += 1
                else:
                    n += 1
                    # BUG FIX: without this break the loop counted EVERY line,
                    # so the whole file (not just the YAML header) was dropped
                    break
            return '\n'.join(stext[n:])

    def process_tex_macros(self, md_in, report=False):
        """
        Expand the standard ``_macros`` TeX macros in the md_in text blob.
        If ``report is True`` then just return the dictionary of macro
        substitutions and the key regex, without substituting.
        """
        m, regex = PublisherBase.tex_to_dict(PublisherBase._macros)
        if report is True:
            return m, regex
        md_in, n = re.subn(regex, lambda x: m.get(
            x[0]), md_in, flags=re.MULTILINE)
        self.workflow(f'MACROS: {n} TeX macros substitutions')
        # lcroof is not handled
        return md_in

    @staticmethod
    def file_name(s):
        """
        Create a sensible random file name from a string s.

        :param s: seed string
        :return: Path 'TMP_<md5-hex>.tex'
        """
        return Path('TMP_' + PublisherBase.string_hash(s) + '.tex')

    @staticmethod
    def string_hash(s):
        """
        Return hash of string s, as a hex string.

        :param s: string to hash
        :return: md5 hex digest
        """
        return hashlib.md5(s.encode('utf-8')).digest().hex()

    @staticmethod
    def run_command(command, flag=True):
        """
        Run a command and show results. Allows for weird xx behavior.

        :param command: argument list for subprocess.Popen
        :param flag: if True, any stderr output raises ValueError
        :return: process exit code
        """
        with Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
            line1 = p.stdout.read()
            line2 = p.stderr.read()
            # BUG FIX: poll() can return None before the process is reaped;
            # wait() always returns the real exit code
            exit_code = p.wait()
        if line1:
            logger.info(line1[-250:])
        if line2:
            if flag:
                raise ValueError(line2)
            else:
                logger.info(line2)
        return exit_code

    @staticmethod
    def tidy():
        """
        tidy up the cwd: remove TMP_ and TeX scratch files.

        :return:
        """
        for pattern in ['TMP_*.*', '*.bak', '*.log', '*.aux', '*.me']:
            for f in Path('.').glob(pattern):
                logger.info(f'unlinking {f}')
                f.unlink()

    @staticmethod
    def convert_pdfs(dir_name, output_folder='', pattern='*.pdf', format='png', dpi=200, transparent=True):
        """
        Bulk conversion of all pdfs in dir_name to png. Linux (pdf2image) only. Pre-run!
        Does not adjust names in the text.

        :raise ModuleNotFoundError: if pdf2image is not installed.
        """
        # fail fast with a clear message rather than a NameError below
        if not has_convert_from_path:
            raise ModuleNotFoundError(
                'pdf2image is required to convert PDF files to png')
        if isinstance(dir_name, str):
            dir_name = Path(dir_name)
        if output_folder == '':
            output_folder = dir_name
        for f in dir_name.glob(pattern):
            fo = f.stem
            logger.info(f'converting {f.name} to {fo}')
            convert_from_path(str(f), dpi=dpi, output_folder=output_folder, fmt=format, transparent=transparent,
                              output_file=fo, single_file=True)

    @staticmethod
    def tex_to_dict(text):
        """
        Convert text, a series of def{} macros one per line, into a dictionary.
        Returns the dictionary and the regex of all keys.
        """
        smacros = text.split('\n')
        # CONSISTENCY FIX: call the splitter on this class rather than the
        # BlogPublisher subclass (same inherited method; avoids a forward
        # reference from base to subclass)
        smacros = [PublisherBase.tex_splitter(i) for i in smacros]
        m = {i: j for (i, j) in smacros}
        regex = '|'.join([re.escape(k) for k in m.keys()])
        return m, regex

    @staticmethod
    def tex_splitter(x):
        """
        x is a single def style tex macro; returns (name, expansion).
        """
        x = x.replace('\\def', '')
        i = x.find('{')
        return x[:i], x[i + 1:-1]

    @staticmethod
    def post_tags_and_dates(dir_path):
        """
        Read info from a set of proto posts: the tag comment (last line) and the
        optional ``<!-- date: ... -->`` comment (second-to-last line).

        :param dir_path: directory of .md proto posts; str or Path
        :return: DataFrame with columns file, date, tags
        """
        if isinstance(dir_path, Path) is False:
            dir_path = Path(dir_path)
        ans = []
        for f in dir_path.glob('*.md'):
            t = f.read_text()
            st = t.strip().split('\n')
            tags = st[-1].strip()
            tags = tags.replace("<!--", "").replace("-->", "").strip()
            date = st[-2].strip()
            if date[:10] == "<!-- date:":
                date = date[10:-4].strip()
            else:
                date = ''
            ans.append([f.name, date, tags])
        setup = pd.DataFrame(ans, columns=['file', 'date', 'tags'])
        return setup
class BlogPublisher(PublisherBase):
    def __init__(self, source_dir='.', update=False, dry_run=True, tex_engine='pdflatex'):
        """
        Manage creation of HTML blog-post files, including creating image files and changing links in Markdown.
        Objective is to publish as-is files that create TeX on the web. Adjustments: PDF images to PNG/JPG/SVG
        (change the link and create the PNG) and TikZ (create SVG file, find begin{figure} find caption and change
        Markdown).

        Adds a final comment to the HTML explaining where the file came from.
        Creates a .bak file with the same name and including all the edits. These SHOULD NEVER BE EDITED!

        If ``update`` is True overwrite existing older HTML files, otherwise skip if exists.
        If ``dry_run`` is True just explain what would happen.
        Note: defaults in fail safe mode!

        The Markdown file can optionally have:

        * A final comment with a list of tags; if there is no tag it is tagged NOC
        * A span, usually near the top, with class description that becomes the og summary and the card summary. If
          missing then the first 150 or so (to a word break) are used.
        * An image with class og_image that becomes the og image (not too large!)

        These elements are used by BlogManager.

        :param source_dir: source directory for files to publish, default is cwd
        :param update: overwrite older existing HTML output when True
        :param dry_run: explain without executing pandoc when True
        :param tex_engine: pdflatex (fast but not fonts) or lualatex (slow but change fonts)
        """
        # NOTE(review): a str source_dir is resolved, a Path is stored as-is —
        # kept for backward compatibility
        if isinstance(source_dir, Path):
            self.source_dir = source_dir
        else:
            self.source_dir = Path(source_dir).resolve()
        self.update = update
        self.dry_run = dry_run
        self.tex_engine = tex_engine
        super().__init__()

    def publish_file(self, fn):
        """
        fn is a markdown file to post file, a str or Path object. Workflow is

        * read markdown, split, find tags (last comment), make post filename
        * check timing and existing blog post files to see if there are any updates
        * expand all @@@s (per markdown_make)
        * expand all basic tex macros
        * deal with pdf graphics (png/jpg/svg versions of pdf files must be created separately); if none
          is found, leave as pdf
        * deal with TikZ pictures and figures (after graphics because it introduces new ![] elements)
        * append workflow, including provenance of file
        * Save .bak file
        * pandoc create HTML file

        :param fn: name of file (or path to file) of markdown
        :return: 0 on skip/dry-run, else pandoc exit code
        """
        if isinstance(fn, str) is True:
            fn = self.source_dir / fn
        assert fn.exists()
        # reset workflow counter
        self.workflow_reset()
        self.workflow(f'INPUT: file={fn.name}')
        txt = fn.read_text(encoding='utf-8')
        stxt = txt.strip().split('\n')
        # tags are in a comment in the last line
        tags = stxt[-1].strip()
        if tags[:4] != '<!--':
            self.workflow('INPUT: NO TAGS using NOC')
            tags = 'NOC'
        else:
            tags = tags.replace('<!--', '').replace('-->', '')
            tags = '-'.join([i.strip().replace(' ', '_')
                             for i in tags.split(',')])
        # second comment is a particular post date
        date = stxt[-2].strip()
        if date[:10] == "<!-- date:":
            date = date[10:-4].strip()
        else:
            date = ''
        if date != '':
            post_name = f'{fn.stem}---{date}-{tags}.html'
        else:
            post_name = f'{fn.stem}---{datetime.now():%Y-%m-%d}-{tags}.html'
        post_full_name = self.web_path / post_name
        self.workflow(f'INPUT: post={post_name}')
        # duplicates: same file name, any date, any tags
        pattern_name = f'{fn.stem}---????-??-??-*.html'
        matching_files = list(self.web_path.glob(pattern_name))
        if len(matching_files) > 0:
            if self.update is True:
                for f in matching_files:
                    if f.stat().st_mtime < fn.stat().st_mtime:
                        self.workflow(
                            f'UPDATE: Updating {f.name} an older HTML file for {fn.name}')
                    elif f.stat().st_mtime >= fn.stat().st_mtime:
                        # NOTE(review): bails on the FIRST newer match, even if
                        # other matches are older — confirm intended
                        logger.warning(
                            f'Skipping newer HTML file {f.name} for {fn.name} already exists')
                        return 0
                    else:
                        # this filename does not exist...but it was globbed
                        logger.warning(
                            f'Previous day creating today HTML file for {fn}??')
                        raise ValueError('This should be impossible')
            else:
                logger.warning(
                    f'EXITING: update==False but {len(matching_files)} matching blog HTML file(s) for {fn} exist(s)')
                return 0
        # else still need to create the post...hence carry on
        # * expand all @@@s (per markdown_make)
        txt = self.process_includes(txt=txt, fn=fn)
        # now have the fully built source file
        # * expand all basic tex macros
        txt = self.process_tex_macros(txt, False)
        # * deal with other graphics (files must be created separately)
        txt = self.adjust_image_links(txt)
        # * deal with TikZ pictures and figures
        tikz = TikzManager(raw_input=txt, doc_path=fn, tex_engine=self.tex_engine)
        tikz.process_tikz()
        txt = tikz.raw_input
        self._workflow.extend(tikz._workflow)
        # make the name for the temp file
        fn = fn.with_suffix('.me')
        # pandoc command (want in the workflow)
        command = ['pandoc', '-f', 'markdown', '-t', 'html', '-o', str(post_full_name),
                   '--highlight-style=pygments', '--mathjax', '--citeproc',
                   f'--bibliography={Path.home()}/S/TELOS/biblio/library.bib', str(fn)]
        # append source information as a comment
        self.workflow(f'BIBLIO: {Path.home()}/S/TELOS/biblio/library.bib')
        self.workflow(f'PROCESS: creating temp file {fn}')
        cmd = " ".join(command)
        Path('make_last.bat').write_text(cmd, encoding='utf-8')
        self.workflow(f'PROCESS: pandoc processing with {cmd}')
        self.workflow('PROCESS: use make_last.bat to re-run final step')
        # * write provenance of file
        txt = txt + self.workflow_get()
        # workflow is closed now...!
        # finally, write the temp file!
        fn.write_text(txt, encoding='utf-8')
        if self.dry_run:
            logger.info('Dry run...not executing...existing.')
            logger.info(f'See {fn.name} file for edits and changes.')
            return 0
        # * pandoc create HTML file
        logger.info(f'Pandoc execution on {fn.name}')
        with Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
            line1 = p.stdout.read()
            line2 = p.stderr.read()
            if line1:
                logger.info(line1)
            if line2:
                logger.error(line2)
            # BUG FIX: poll() may return None before the process is reaped;
            # wait() returns the real exit code
            exit_code = p.wait()
        return exit_code

    def publish_dir(self, pattern='*.md', tidy=True):
        """
        Publish all files matching ``pattern`` to web_path.

        :param tidy: if True, remove TMP_/scratch files after the batch.
        """
        for fn in self.source_dir.glob(pattern):
            # BUG FIX: publish_file takes no ``tidy`` argument; passing it
            # raised TypeError on every call
            self.publish_file(fn)
        if tidy:
            self.tidy()

    def adjust_image_links(self, txt):
        """
        Convert pdf figure links. DOES NOT MAKE the new images (that needs pdf2image (Linux)); it looks
        for likely contenders and selects one.
        Completely separate from dealing with tikz.
        Looks in the same folder for an appropriate non-pdf version of the file: prefers SVG then PNG then JPG.
        If no file found then an SVG is created with pdf2svg (noted in the workflow).
        Note, these file names are further tinkered to move them to the website static folder.
        See git history for an attempt to use divsvgm -P filename conversion...but those svg files do not
        render.
        """
        # need to look for images and copy them over
        if txt.find('![') < 0:
            self.workflow('No image links found')
        # find candidates - lock in since you will be changing txt
        matches = list(re.findall(
            r'(!\[((?:.|\n)*?)\]\((.+?)\))(\{.*?\})?', txt))
        for whole_match, caption, file_name, classes in matches:
            image_file = self.source_dir / file_name
            if file_name[:4] == 'http':
                # external link - not adjusted
                self.workflow(f'IMAGE: External link unadjusted: {file_name}')
                continue
            elif image_file.exists() is True and image_file.suffix != '.pdf':
                self.workflow(f'IMAGE: Non PDF link unadjusted: {file_name}')
                new_file = image_file
            elif image_file.exists() is False:
                # this is just a general problem...should not occur often
                self.workflow(
                    f'IMAGE: Image file does not exist: leaving link unchanged for {file_name}')
                continue
            else:
                # file exists and is a pdf...find a replacement that is at
                # least as new as the pdf: prefer svg, then png, then jpg
                for kind in ['.svg', '.png', '.jpg']:
                    new_file = image_file.with_suffix(kind)
                    if new_file.exists() and new_file.stat().st_mtime >= image_file.stat().st_mtime:
                        break
                else:
                    # did not find an alternative: make an SVG
                    new_file = image_file.with_suffix('.svg')
                    self.workflow(
                        f'IMAGE: Creating svg file for {image_file.name} (using new pdf2svg util)')
                    # https://github.com/jalios/pdf2svg-windows
                    command = [
                        'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg', str(image_file), str(new_file)]
                    self.run_command(command)
            # copy over new file, which by construction must exist
            web_file = (self.static_img_path / new_file.name)
            if web_file.exists():
                # safe rather than sorry on re-creating the link
                web_file.unlink()
            self.workflow(f'IMAGE: Creating link {web_file} for {file_name}')
            # Path.link_to(web_file) syntax: make web_file a hard link to this path.
            new_file.link_to(web_file)
            # link for the website, relative to the base of the blog
            link_name = self.web_link(web_file)
            # finally, have to adjust the link name and add 100% width ; classes includes the braces
            txt = txt.replace(f'({file_name}){classes}',
                              f'({link_name}){{width=100%}}')
            self.workflow(
                f'IMAGE: txt image link ![]({file_name}) replaced with ![...]({link_name})')
            if classes == '':
                self.workflow('IMAGE:>>>class {{width=100%}} added')
            else:
                self.workflow(
                    f'IMAGE:>>>class {classes} replaced with {{width=100%}}')
        return txt
class TikzManager(PublisherBase):
    """
    Convert tikz figures embedded in a document into stand-alone svg image files.
    """

    # Stand-alone LaTeX wrapper used to compile a single tikzpicture/tikzcd.
    # Doubled braces are str.format escapes; only {tikz_begin}, {tikz_code}
    # and {tikz_end} are substitution slots.
    _tex_template = """\\documentclass[border=5mm]{{standalone}}
% needs lualatex - uncomment for Wiley fonts
\\usepackage{{fontspec}}
\\setmainfont{{Stix Two Text}}
\\usepackage{{unicode-math}}
\\setmathfont{{Stix Two Math}}
\\usepackage{{url}}
\\usepackage{{tikz}}
\\usepackage{{color}}
\\usetikzlibrary{{arrows,calc,positioning,shadows.blur,decorations.pathreplacing}}
\\usetikzlibrary{{automata}}
\\usetikzlibrary{{fit}}
\\usetikzlibrary{{snakes}}
\\usetikzlibrary{{intersections}}
\\usetikzlibrary{{decorations.markings,decorations.text,decorations.pathmorphing,decorations.shapes}}
\\usetikzlibrary{{decorations.fractals,decorations.footprints}}
\\usetikzlibrary{{graphs}}
\\usetikzlibrary{{matrix}}
\\usetikzlibrary{{shapes.geometric}}
\\usetikzlibrary{{mindmap, shadows}}
\\usetikzlibrary{{backgrounds}}
\\usetikzlibrary{{cd}}
% really common macros
\\newcommand{{\\grtspacer}}{{\\vphantom{{lp}}}}
\\def\\dfrac{{\\displaystyle\\frac}}
\\def\\dint{{\\displaystyle\\int}}
\\begin{{document}}
{tikz_begin}{tikz_code}{tikz_end}
\\end{{document}}
"""
[docs] def __init__(self, *, raw_input='', doc_path=None, tex_engine='pdflatex'):
"""
Convert tikz figures in input text (raw_input) or a file (doc_path) into stand-alone svg files,
saved in web_path (usually the static/img folder).
If raw_input == '' then it is read from doc_path.
doc_path is used to determine if temp .tex files need updating.
When called by BlogPublisher, doc_path text has already been adjusted, hence raw_input.
When called stand-alone raw_input==''.
"""
if type(doc_path) == str:
self.doc_path = Path(doc_path)
elif doc_path is not None:
self.doc_path = doc_path
if raw_input != '':
self.raw_input = raw_input
else:
self.raw_input = doc_path.read_text(encoding='utf-8')
if doc_path is None:
# make a temp .tex filename
self.doc_path = self.file_name(self.raw_input)
self.tex_engine = tex_engine
super().__init__()
[docs] @staticmethod
def split_tikz(txt):
"""
Split text to get the tikzpicture. Format is
initial text pip then groups of four:
1. begin tag ``(1::4)``
2. tikz code ``(2::4)``
3. end tag ``(3::4)``
4. non-related text ``(4::4)``
"""
return re.split(r'(\\begin{tikz(?:cd|picture)}|\\end{tikz(?:cd|picture)})', txt)
[docs] def list_tikz(self):
"""
List the figures in doc_fn
"""
return self.split_tikz(self.raw_input)[2::4]
[docs] def process_tikz(self):
"""
Process the tikz figures/tables/sidewaystables in the doc into svg files.
"""
all_containers = self.split_figures()
begin_tags = iter(all_containers[1::4])
outer_codes = iter(all_containers[2::4])
end_tags = iter(all_containers[3::4])
# next_blob = iter(all_containers[4::4])
for i, begin_tag, outer_code, end_tag in zip(count(), begin_tags, outer_codes, end_tags):
# find tikzpicture, tikzcd etc.
if outer_code.find('\\begin{tikz') >= 0:
# container contains a tikzpicture
caption = re.search(
r'\\caption\{((?:.|\n)*?)\}\n', outer_code, flags=re.MULTILINE)
if caption is None:
caption = ''
else:
caption = caption[1]
# adjust the original doc; will create a tex file, tex it to pdf, create svg file,
# link the svg file into web (and keep a local copy).
svg_path = self.doc_path.with_suffix(
f'.{self.string_hash(outer_code)}.{i}.svg')
tex_path = svg_path.with_suffix('.tex')
web_path = self.static_img_path / svg_path.name
# this is a string link for the output doc
web_link = self.web_link(web_path)
if begin_tag.find('figure') > 0:
lbl = '*Figure:*'
else:
lbl = '*Table:*'
self.raw_input = self.raw_input.replace(
f'{begin_tag}{outer_code}{end_tag}',
f"\n\n![{lbl} {caption}]({web_link}){{width=100%}}\n\n"
)
# do not have to worry about existing classes - this was a figure or table...
self.workflow(
f'TIKZ: replaced text for {begin_tag}...{end_tag} with ![...]({web_link})')
# process if the svg files is older than doc_path
# Assumes that you don't tinker with links...
# is True and svg_path.stat().st_mtime >= self.doc_path.stat().st_mtime:
if svg_path.exists():
self.workflow(
f'TIKZ: using existing svg file for Tikz #{i}, {svg_path.name}')
else:
# make tex code for a stand-alone document
tikz_begin, tikz_code, tikz_end = self.split_tikz(outer_code)[
1:4]
tex_code = self._tex_template.format(
tikz_begin=tikz_begin, tikz_code=tikz_code, tikz_end=tikz_end)
tex_path.write_text(tex_code, encoding='utf-8')
self.workflow(
f'TIKZ: diagram #{i}, created temp file = {tex_path.name}')
pdf_file = tex_path.with_suffix('.pdf')
self.workflow(f'TIKZ: Update pdf file for Tikz #{i}')
if self.tex_engine == 'pdflatex':
# faster with template
# TODO EVID hard coded template
template = str(Path.home() / 'S/TELOS/Blog/format/tikz.fmt')
command = ['pdflatex', f'--fmt={template}', str(tex_path)]
else:
# for STIX fonts, no template
command = ['lualatex', str(tex_path)]
self.workflow(f'TIKZ: TeX Command={" ".join(command)}')
self.run_command(command)
self.workflow(
f'TIKZ: Creating svg file for Tikz #{i} (using new pdf2svg util)')
# https://github.com/jalios/pdf2svg-windows
command = [
'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg', str(pdf_file), str(svg_path)]
# seems to return info on stderr?
self.run_command(command, flag=False)
# create a nice name version of the svg file
if str(web_path) != str(svg_path):
if web_path.exists():
web_path.unlink()
svg_path.link_to(web_path)
self.workflow(
f'TIKZ: Linking {web_path} pointing to {svg_path} for Tikz #{i}')
# command line related
def setup_parser():
    """
    Set up all command line options and return parser.

    :return: configured :class:`argparse.ArgumentParser` object
    """
    parser = argparse.ArgumentParser(
        description='BlogManager: create and manage blog posts. All posted to the default Blog website (global variable).',
        epilog='Examples: (1) python -m blog_tools -a post_file -f *.md posts all markdown files in the current directory. '
               '(2) python -m blog_tools -d new_posts -a post_dir posts all markdown files in the directory new_posts. '
               '(3) python -m blog_tools -a convert -c *.pdf converts all pdf files in the current directory to '
               # fixed: closing parenthesis was missing
               '200 dpi PNGs (the defaults, set with --dpi and --format).'
    )
    # Debug group and general control
    parser.add_argument('-y', '--dry_run', action="store_true",
                        help='dry_run mode: nothing actually done.')
    parser.add_argument('-d', '--directory', action='store', type=str, default='',
                        metavar='SOURCE_DIRECTORY_NAME',
                        help='Source directory for files, default is cwd.')
    parser.add_argument('-u', '--update', action="store_true",
                        help='Update mode: only update files where md is newer than html.')
    action_list = ['post_file', 'post_dir', 'convert']
    parser.add_argument('-a', '--action', action='store', choices=action_list,
                        help='Determines the action: post a file, directory, or run pdf converter (Linux only).')
    # post related
    engine_list = ['pdflatex', 'lualatex']
    parser.add_argument('-t', '--tex', action='store', choices=engine_list, default='pdflatex',
                        help='Specify TeX engine. pdflatex = fast, no fonts; lualatex = slow with fonts.')
    parser.add_argument('-f', '--files', action='store', type=str, default='',
                        metavar='FILE_PATTERN',
                        help='Files filtered matching FILE_PATTERN. For post or convert. Can be a single filename.')
    parser.add_argument('-r', '--refresh', action="store_true",
                        help='Refresh server issuing a curl http://127.0.0.1:5000/blog/reset command.')
    # convert related
    parser.add_argument('-c', '--convert', action='store', type=str, default='*.pdf',
                        metavar='CONVERT_FILE_PATTERN',
                        help='Convert all files in current directory matching CONVERT_FILE_PATTERN to FORMAT. '
                             'Run from Linux (smve38_clean). For example, to convert img/*.pdf '
                             # fixed: example previously showed --format=*.pdf, but --format is the
                             # *output* format; the pattern is given with -c
                             'python -m blog_tools -a convert -d img -c "*.pdf". Converted files are written '
                             'to the same directory. ')
    parser.add_argument('--format', action='store', type=str, default='png',
                        metavar='FORMAT',
                        help='Set output file type FORMAT for convert.')
    parser.add_argument('--dpi', action='store', type=int, default=200,
                        metavar='DPI',
                        help='Set DPI level for convert.')
    return parser
def main():
    """
    Handle command line operation.

    Needs to be a function for sphinx argparse.

    :return: None
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = setup_parser()
    args = parser.parse_args()
    # default to cwd when no source directory given
    source_dir = Path(args.directory) if args.directory else Path('.')
    # make blog publisher object
    bp = BlogPublisher(source_dir, update=args.update, dry_run=args.dry_run)
    if args.action == 'post_file':
        # just those matching args.files pattern
        for f in source_dir.glob(args.files):
            bp.publish_file(f)
    elif args.action == 'post_dir':
        bp.publish_dir()
    elif args.action == 'convert':
        # Fix: -c/--convert was parsed but never used; prefer -f when given,
        # otherwise fall back to the -c pattern (default *.pdf), matching the
        # parser's documented example (3).
        pattern = args.files or args.convert
        bp.convert_pdfs(source_dir, source_dir, pattern=pattern,
                        format=args.format, dpi=args.dpi)
    if args.refresh:
        # poke the dev server so it reloads the blog cache
        command = ['curl', 'http://127.0.0.1:5000/blog/reset']
        PublisherBase.run_command(command, flag=False)
# Script entry point: run the command-line handler when executed directly.
if __name__ == '__main__':
    main()