# Source code for blog_tools

# BlogManager
# BlogPublisher
# TikzManager
# ImageManager

from titlecase import titlecase
import pandas as pd
from datetime import datetime
from itertools import count
from bs4 import BeautifulSoup
import argparse
from pathlib3x import Path
from subprocess import Popen, PIPE
import hashlib
import logging
import re
import numpy as np

# Module-level logger; handlers/levels are configured by the consuming app.
logger = logging.getLogger(__name__)

# pdf2image is optional: only needed for the Linux-side bulk PDF -> PNG
# conversion (PublisherBase.convert_pdfs); everything else works without it.
try:
    from pdf2image import convert_from_path
    has_convert_from_path = True
except ModuleNotFoundError:
    # logger.warning('No pdf2image...cannot convert PDF files to png.')
    has_convert_from_path = False

# blog website files (destination directory for published HTML posts)
BLOG_PATH = Path.home() / 'new_mynl/blog'

# folder for created images (kept separate from the posts themselves)
STATIC_IMAGE_PATH = (BLOG_PATH / '../static/blog_img').resolve()

# this is not printing??
# poor mans
# logger = type('asd', (), {})
# logger.info = print
# logger.warning = print
# logger.debug = print
# logger.error = print


class BlogManager(object):
    """
    BlogManager is used by the flask_app to do all the content management
    for the blog. The Flask app handles the actual rendering, but all
    reports and extracts come from BlogManager.
    """
[docs] def __init__(self, post_dir=''): """ """ if post_dir == '': self.post_dir = BLOG_PATH elif isinstance(post_dir, Path) is False: self.post_dir = Path(post_dir) else: self.post_dir = post_dir self._blog_df = None self._tag_to_post_df = None self._top_tags = None
[docs] def refresh(self): """ Force cache update """ self._blog_df = None self._tag_to_post_df = None
    @property
    def blog_df(self):
        # DataFrame of all posts, indexed by post hash; built lazily from
        # the HTML files in post_dir.
        if self._blog_df is None:
            self._blog_df = self.blog_entries_to_df(self.post_dir)
        return self._blog_df

    @property
    def tag_to_post_df(self):
        # long-format (tag, hash) mapping: one row per tag per post
        if self._tag_to_post_df is None:
            self._tag_to_post_df = pd.DataFrame(columns=['tag', 'hash'])
            i = count()
            for n, x in self.blog_df.iterrows():
                for t in x.tag_list:
                    self._tag_to_post_df.loc[next(i)] = [t, n]
        return self._tag_to_post_df

    @property
    def top_tags(self):
        # tags with their post counts, most used first, ties alphabetical;
        # cached for the lifetime of the object once built
        if self._top_tags is None:
            self._top_tags = self.tag_to_post_df.groupby('tag').count(). \
                reset_index(drop=False).sort_values(['hash', 'tag'], ascending=[False, True]). \
                set_index('tag')
        return self._top_tags

    @property
    def top_tag_menu(self):
        """
        Menu items for top [5] tags.

        :return: (nav-item HTML for the top 5, dropdown-item HTML for the rest)
        """
        ans = []
        dd_ans = []
        for n, r in self.top_tags.head(5).iterrows():
            ans.append(
                f'<li class="nav-item"> <a class="nav-link" href="/blog/{n}">{n}</a></li>'
                # to include number of posts in parens
                # f'<li class="nav-item"> <a class="nav-link" href="/blog/{n}">{n} ({r["hash"]})</a></li>'
            )
        for n, r in self.top_tags.iloc[5:].iterrows():
            dd_ans.append(
                f'<a class="dropdown-item" href="/blog/{n}">{n} ({r["hash"]})</a>'
            )
        return '\n'.join(ans), '\n'.join(dd_ans)
[docs] def search_tag(self, regex): """ Search tags (list of individual tags) using a regex Only finds one tag type """ # tags --> hashes idx = self.tag_to_post_df['tag'].str.match(regex, flags=re.IGNORECASE) # get post hash_idx, note there may be duplicates hash_idx = set(self.tag_to_post_df.loc[idx].hash) return self.blog_df.loc[hash_idx]
[docs] def search_regex(self, regex, field): """ Search through field using a regex and return relevant posts tag, title, html, post_date, access, modify, create, size NOT tag_list, words (a set) ``field`` must be a column in ``tag_to_post_df`` Regex runs a contains query: you are responsible for start/finish """ # regex mode hash_idx = self.blog_df[field].str.contains(regex, flags=re.IGNORECASE) return self.blog_df.loc[hash_idx]
[docs] def search_query(self, query): """ Send well formed query to blog_df """ # query mode return self.blog_df.query(query)
    def _search(self, query):
        """
        Implements ``!`` (word regex) and `` and `` (tag intersection)
        searching.

        :param query: '!regex' searches the words column; 'a and b'
            intersects tag matches; anything else is a plain tag match.
        :return: (matching_posts DataFrame, human-readable list label)
        """
        # NOTE(review): an empty query string would raise IndexError on
        # query[0] -- callers appear to always pass non-empty text.
        if query[0] == '!':
            # word-search mode: strip the bang, search the words column
            matching_posts = self.search_regex(query[1:], 'words')
            list_label = f'Recent posts with words matching "{query[1:]}"'
        elif query.find(' and ') >= 0:
            # intersection of the tag matches for each term
            queries = query.split(" and ")
            idx = []
            for r in queries:
                match = self.search_tag(r)
                idx.append(match.index)
            ans = set(idx[0])
            for j in idx[1:]:
                ans = ans.intersection(j)
                if len(ans) == 0:
                    # short-circuit: intersection can only shrink
                    break
            # this works even if ans is empty
            matching_posts = self.blog_df.loc[ans]
            list_label = f'Recent posts tagged {query}'
        else:
            matching_posts = self.search_tag(query)
            list_label = f'Recent posts with tags matching "{query}"'
        return matching_posts, list_label

    @property
    def tags(self):
        """
        Returns an iterable of distinct tags
        """
        return self.tag_to_post_df['tag'].unique()
    def make_card_list(self, tag):
        """
        Return six most recent posts, to be rendered in snapshot cards.

        :param tag: search expression, passed to _search
        :return: list of up to two rows, each a list of up to three card
            dicts (title/text/hash/tags/posted)
        """
        matching_posts, list_label = self._search(tag)
        row_iter = matching_posts.sort_values(
            'modify_date', ascending=False).head(12).iterrows()
        card_list = []
        try:
            # two rows of three cards; StopIteration ends early when there
            # are fewer than six matches
            for r in range(2):
                cards = []
                card_list.append(cards)
                for i in range(3):
                    h, x = next(row_iter)
                    # NOTE(review): tags_to_tag_links is not defined in this
                    # view -- presumably provided elsewhere on the class.
                    short_tags = self.tags_to_tag_links(x['tags'], glue=', ')
                    c1 = dict(title=x['title'], text=x['summary'], hash=h,
                              tags=short_tags, posted=x['post_date'])
                    cards.append(c1)
        except StopIteration:
            # bomb out when you get to the end...
            # return BLOG.search_tag(tag).to_html()
            pass
        return card_list
[docs] def list_of_posts(self, regex): """ Return list of matching blog files formatted as a HTML list. Three modes: 1. regex is !str: run as a regex query against all words 2. regex contains " and ", split and run intersection of type 3 3. regix run as match against tag list. """ matching_posts, list_label = self._search(regex) posts = [] for n, r in matching_posts.sort_values(['modify_date', 'title'], ascending=(False, True)).iterrows(): posts.append(f'<li> <a href="/blog?id={n}" class="text-light">{r["post_date"]} | ' f'{r["title"]}</a>') posts = '\n'.join(posts) return posts, list_label
    def report(self, kind):
        """
        Create a report about the posts; returns html.

        kind = title, date, modify, tag, or statistics; anything else
        returns an error heading.
        """
        # internal column name -> display heading
        renamer = {'post_date': 'Post date', 'title': 'Title', 'size': 'Size',
                   'tag': 'Tag', 'tags': 'Tags', 'avg_date': 'Avg date',
                   'number': 'Number', 'avg_size': 'Avg size',
                   'max_size': 'Max size', 'cross_tags': 'Num tags',
                   'modify_date': 'Modify date'}
        # NOTE(review): Styler.hide_index() and Styler.render() were removed
        # in pandas 2.0 (hide(axis='index') / to_html()) -- this module
        # appears pinned to an older pandas.
        if kind == 'title':
            bit = self.blog_df[['post_date', 'modify_date', 'title', 'size', 'tags']]
            bit = bit.sort_values(['title', 'post_date'], ascending=[True, False])
            # bit.index = range(1, len(bit)+1)
            # bit.index.name = 'post'
            # titles become links into the blog viewer
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['title', 'post_date', 'size', 'tags']].rename(
                columns=renamer)
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()
        elif kind == 'date':
            bit = self.blog_df[['post_date', 'title', 'tags']]
            bit = bit.sort_values(['post_date', 'title'],
                                  ascending=[False, True])
            # bit.index = range(1, len(bit)+1)
            # bit.index.name = 'post'
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['post_date', 'title', 'tags']].rename(columns=renamer)
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()
        elif kind == 'modify':
            bit = self.blog_df[['modify_date', 'title', 'tags']]
            bit = bit.sort_values(
                ['modify_date', 'title'], ascending=[False, True])
            # bit.index = range(1, len(bit)+1)
            # bit.index.name = 'post'
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['modify_date', 'title', 'tags']].rename(columns=renamer)
            s = bit.style.format(
                formatter={'Modify date': "{:%Y-%m-%d}"}).hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()
        elif kind == 'tag':
            # one row per (tag, post) pair with the post details merged in
            bit = self.tag_to_post_df.merge(self.blog_df[['post_date', 'title']],
                                            left_on='hash', right_index=True, how='left')
            bit = bit.sort_values(['tag', 'title'])
            # bit.index = range(1, len(bit)+1)
            # bit.index.name = 'post'
            bit.title = [
                f'<a href="/blog?id={h}">{i}</a>'
                for _, (h, i) in bit[['hash', 'title']].iterrows()]
            bit.tag = [f'<a href="/blog/{t}">{t}</a>' for t in bit['tag']]
            bit = bit[['tag', 'title', 'post_date']].rename(columns=renamer)
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()
        elif kind == 'statistics':
            bit = self.tag_to_post_df.merge(
                self.blog_df[['post_date', 'modify_date', 'size', 'title', 'summary', 'tag_list']],
                left_on='hash', right_index=True, how='left')
            # weird error
            gb = bit.groupby('tag').agg(number=('hash', np.size),
                                        avg_size=('size', np.mean),
                                        max_size=('size', np.max),
                                        avg_date=('modify_date', np.mean),
                                        cross_tags=('tag_list', lambda x: np.mean(
                                            [len(i) for i in x]))
                                        ).sort_values('number', ascending=False)
            gb['tag'] = [f'<a href="/blog/{i}">{i}</a>' for i in gb.index]
            gb = gb[['tag', 'avg_date', 'number', 'avg_size', 'max_size', 'cross_tags']]. \
                sort_values(['number', 'avg_date'], ascending=[False, True]). \
                rename(columns=renamer)
            # see https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
            s = gb.style.format(formatter={'Avg date': "{:%Y-%m-%d}", 'Avg size': '{:,.0f}',
                                           'Max size': '{:,.0f}', 'Num tags': '{:.1f}'}).hide_index()
            s = s.set_table_styles(
                [{'selector': 'td', 'props': [('text-align', 'right')]}])
            return s.render()
        else:
            return '<h2>Unknown report type</h2>'
    @staticmethod
    def blog_entries_to_df(p):
        """
        Convert blog entries in directory p (a Path) to a dataframe. Parse
        tags, title, etc.

        * Must start with an h1; generally that is the only h1 in the document
        * A final comment with a list of tags; if there is no tag it is
          tagged NOC
        * A span, usually near the top, with class description that becomes
          the og summary and the card summary. If missing then the first 150
          or so (to a word break) are used.
        * An image with class og_image that becomes the og image (not too
          large!)
        """
        assert p.exists()
        posts = list(p.glob('*.html'))
        ans = []
        for post in posts:
            # logger.info(f'Processing {post.stem}')
            try:
                # filename convention: title---YYYY-MM-DD-tag1-tag2.html
                _, title, date, tags, _ = re.split(
                    r'(.*)\-\-\-(....\-..\-..)\-(.*)', post.stem)
                title = titlecase(title.replace('-', ' '))
                tags = tags.replace('-', ', ').replace('_', ' ')
                txt = post.read_text(encoding='utf-8')
                stats = post.stat()
                # file times as tz-aware timestamps converted to US/Eastern
                access, modify, create = map(lambda x: pd.to_datetime(x, unit='s', utc=True).tz_convert('US/Eastern'),
                                             [stats.st_atime, stats.st_mtime, stats.st_ctime])
                soup = BeautifulSoup(txt, "html.parser")
                # figure the description
                temp = soup('span', class_='description')
                if len(temp) > 0:
                    summary = str(temp[0])
                else:
                    # no canned description: take the first extract_length-ish
                    # words from the leading text elements
                    extract_length = 35
                    n = 0
                    bit = []
                    for e in soup(['h2', 'h3', 'p', 'ul', 'ol', 'div', 'span']):
                        # need to get something...
                        if n == 0:
                            if len(e.text) < extract_length:
                                bit.append(e.text)
                            else:
                                # NOTE(review): temp2 counts words, not chars,
                                # against a character budget -- confirm intent.
                                temp1 = []
                                temp2 = 0
                                for w in e.text.split():
                                    if temp2 + len(w) < extract_length:
                                        temp1.append(w)
                                        temp2 += 1
                                    else:
                                        break
                                # _ = ' '.join(temp1)
                                # print('Appending', n, len(_), _)
                                bit.append(' '.join(temp1))
                                break
                        elif n + len(e.text) < extract_length:
                            bit.append(e.text)
                            n += len(e.text.split())
                        else:
                            break
                    summary = '\n'.join(bit)
                # figure an og_image: prefer the tagged one, else first image
                temp = soup('img', id='og_image')
                if len(temp) > 0:
                    og_image = temp[0]['src']
                else:
                    temp = soup('img')
                    if len(temp) > 0:
                        og_image = temp[0]['src']
                    else:
                        og_image = ''
                # hash based on the FILE NAME rather than the content...
                # allows changes when entry has a fixed date
                h = hashlib.md5(post.stem.encode('utf-8')).digest().hex()
                ans.append([date, access, modify, create, stats.st_size, tags,
                            tags.split(', '), title, txt, summary, og_image,
                            str(post), h])
            except ValueError:
                # filename did not match the convention; skip with a log entry
                logger.error(f'{post}')
        df = pd.DataFrame(ans, columns=['post_date', 'access_date', 'modify_date', 'create_date',
                                        'size', 'tags', 'tag_list', 'title', 'html', 'summary',
                                        'og_image', 'path', 'hash'])
        # strip comments and tags to leave the searchable word text
        df['words'] = df.html.str.replace(
            pat=r'(?=<!--)([\s\S]*?)-->|<.*?>', repl='', flags=re.MULTILINE)
        # bag of words with no punct and lower case
        # df['bow'] = df.words.apply(lambda x: set(re.sub(r'[,.?!{}()\[\]]', '', x.lower(), flags=re.MULTILINE).split()))
        # originally
        # df.index = [hashlib.md5(t.encode('utf-8')).digest().hex() for t in df.html]
        # df.index.name = 'hash'
        df = df.set_index('hash')
        # remove duplicates (TODO: this shouldn't happen!) keep the oldest
        # version of each post...remember the text is the same
        df = df.sort_values(['modify_date', 'title'], ascending=[False, True])
        dups = df.index.duplicated(keep='last')
        df = df.loc[~dups].copy()
        return df
[docs] @staticmethod def name_to_parts(fn): """ fn is a Path """ fn = fn.stem # date = fn[:10] tags, title = re.split(r'(.*?)\-\-\-(.*)', fn[11:])[1:-1] tags = tags.split('-') title = titlecase(title.replace('-', ' ')) return tags, title
class PublisherBase(object):
    """
    Container for some static functions. Handles workflow tracking.
    """

    # standard TeX macro definitions expanded into the Markdown before
    # pandoc runs; one \def per line -- tex_to_dict splits on newlines.
    _macros = r"""\def\E{\mathsf{E}}
\def\Var{\mathsf{Var}}
\def\var{\mathsf{var}}
\def\SD{\mathsf{SD}}
\def\VaR{\mathsf{VaR}}
\def\CTE{\mathsf{CTE}}
\def\WCE{\mathsf{WCE}}
\def\AVaR{\mathsf{AVaR}}
\def\CVaR{\mathsf{CVaR}}
\def\TVaR{\mathsf{TVaR}}
\def\biTVaR{\mathsf{biTVaR}}
\def\ES{\mathsf{ES}}
\def\EPD{\mathsf{EPD}}
\def\cov{\mathsf{cov}}
\def\corr{\mathsf{Corr}}
\def\Pr{\mathsf{Pr}}
\def\ecirc{\accentset{\circ} e}
\def\dsum{\displaystyle\sum}
\def\dint{\displaystyle\int}
\def\AA{\mathcal{A}}
\def\bb{\bm{b}}
\def\ww{\bm{w}}
\def\xx{\bm{x}}
\def\yy{\bm{y}}
\def\HH{\bm{H}}
\def\FFF{\mathscr{F}}
\def\FF{\mathcal{F}}
\def\MM{\mathcal{M}}
\def\OO{\mathscr{O}}
\def\PPP{\mathscr{P}}
\def\PP{\mathsf{P}}
\def\QQ{\mathsf{Q}}
\def\RR{\mathbb{R}}
\def\ZZ{\mathbb{Z}}
\def\NN{\mathbb{N}}
\def\XXX{\mathcal{X}}
\def\XX{\bm{X}}
\def\ZZZ{\mathcal{Z}}
\def\bbeta{\bm{\beta}}
\def\cp{\mathsf{CP}}
\def\atan{\mathrm{atan}}
\def\ecirc{\accentset{\circ} e}
\def\tpx{{{}_tp_x}}
\def\kpx{{{}_kp_x}}
\def\tpy{{{}_tp_y}}
\def\tpxy{{{}_tp_{xy}}}
\def\tpxybar{{{}_tp_{\overline{xy}}}}
\def\tqx{{{}_tq_x}}"""
    def __init__(self):
        """
        Set up the standard destination paths and an empty workflow log.
        """
        # path for posts
        self.web_path = BLOG_PATH
        # path for images
        self.static_img_path = STATIC_IMAGE_PATH
        # log workflow (TODO: sad that logged items are detached from where
        # they are called)
        self._workflow = []
        self._n = 0
[docs] def workflow_reset(self): self._workflow = [] self._n = 0
[docs] def workflow(self, msg): """ Add a message to the workflow :param msg: :return: """ logger.info(msg) self._n += 1 self._workflow.append(f'({self._n:02d}) {msg}')
    def workflow_show(self):
        """Print the workflow log to stdout, one message per line."""
        print('\n'.join(self._workflow))
[docs] def workflow_get(self): """ Return the workflow object as an HTML comment """ nl = '\n' return f"\n\n<!--\n{nl.join(self._workflow)}\n-->\n"
    def workflow_raw(self):
        """Return the underlying workflow list (not a copy)."""
        return self._workflow
    def process_includes(self, *, txt='', fn=None):
        """
        Stand-alone process includes.

        txt = current status of buffer. fn = Path object source. If txt==''
        then txt is read from fn. This allows it to be used stand-alone.

        Not static because it calls functions that access the workflow. But
        can be part of the base.

        :param txt: buffer contents; '' means read from ``fn``
        :param fn: source Path, used for relative include resolution
        :return: txt with includes resolved.
        """
        if txt == '':
            txt = fn.read_text(encoding='utf-8')
        if txt.find('@@@') < 0:
            # nothing to include
            return txt
        # else have work to do
        if fn is None:
            fn = Path('.')
        base_dir = fn.parent.resolve()
        n_includes = 0
        # first, substitute for all NNN specs (keep this for backwards compatibility)
        # assumes you are in the current directory
        # NOTE(review): i[0:3] slices a Path object (pathlib3x?) and the map
        # is built from base_dir.parent, not base_dir -- confirm both.
        file_map = {i[0:3]: i for i in base_dir.parent.glob("*.md")}
        txt, n_includes = self._process_includes(
            txt, base_dir, n_includes, file_map)
        self.workflow(f'IMPORT: {n_includes} files imported')
        return txt
def _process_includes(self, txt, base_dir, n_includes, file_map): """ Process @@@ include elements. From markdown_make.py without color_includes logic Iterative processing of include files file_map looks for nnn_something.md files in the current directory base_dir = directory name """ includes = re.findall( r'@@@include ([\./]*)([0-9]{3}|[0-9A-Za-z])([^\n]+\.[a-z]+)?', txt) for res_ in includes: original_match = ''.join(res_) # logger.info(res_, file_map) # res_[1] looks for nnn type files and tries to find them in file_map if res_[2] == '': res = file_map[res_[1]] # logger.info(f'REPLACING {res_} with {res}') else: res = original_match # logger.info(f'using {"".join(res_)} as {res}') self.workflow(f'IMPORT: Importing {res}') n_includes += 1 try: repl = (base_dir / res).read_text(encoding='utf-8') repl = self._strip_yaml(repl) repl, n_includes = self._process_includes( repl, base_dir, n_includes, file_map) txt = txt.replace(f'@@@include {original_match}', repl) except FileNotFoundError: self.workflow( f'IMPORT: WARNING @@@ included file {res} not found...ignoring') return txt, n_includes def _strip_yaml(self, text): """ Strip starging yaml, between first --- and next --- from text. Applies to included files. From markdown_make.py. :param text: :return: """ if text[:3] != '---': return text else: self.workflow('Stripped YAML') stext = text.split('\n') stext.pop(0) n = 0 for ln in stext: if ln != '---': n += 1 else: n += 1 return '\n'.join(stext[n:])
    def process_tex_macros(self, md_in, report=False):
        """
        Expand the standard general.tex macros in the ``md_in`` text blob.

        If ``report is True`` then just return the dictionary of macro
        substitutions and its regex, without touching ``md_in``.

        :param md_in: Markdown text
        :param report: if True return (macro dict, regex) instead
        :return: text with macros substituted, or the (dict, regex) report
        """
        m, regex = PublisherBase.tex_to_dict(PublisherBase._macros)
        if report is True:
            return m, regex
        # single pass: replace every macro token with its expansion
        md_in, n = re.subn(regex, lambda x: m.get(
            x[0]), md_in, flags=re.MULTILINE)
        self.workflow(f'MACROS: {n} TeX macros substitutions')
        # lcroof is not handled
        return md_in
[docs] @staticmethod def file_name(s): """ Create a sensible random file name from a string s :param s: :return: """ return Path('TMP_' + PublisherBase.string_hash(s) + '.tex')
[docs] @staticmethod def string_hash(s): """ Return hash of string s, as a hex string :param s: :return: """ return hashlib.md5(s.encode('utf-8')).digest().hex()
    @staticmethod
    def run_command(command, flag=True):
        """
        Run a command and show results. Allows for weird xx behavior.

        :param command: argument list passed to subprocess.Popen
        :param flag: if True, any stderr output raises ValueError; if False
            stderr is only logged (some tools report progress on stderr)
        :return: exit code from poll() -- may be None if the process has
            not terminated when polled
        """
        with Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
            line1 = p.stdout.read()
            line2 = p.stderr.read()
            exit_code = p.poll()
            if line1:
                # only show the tail of a long stdout
                logger.info(line1[-250:])
            if line2:
                if flag:
                    raise ValueError(line2)
                else:
                    logger.info(line2)
            return exit_code
    @staticmethod
    def tidy():
        """
        Tidy up the cwd: delete TeX and temp-file droppings
        (TMP_*.*, *.bak, *.log, *.aux, *.me).

        :return: None
        """
        for pattern in ['TMP_*.*', '*.bak', '*.log', '*.aux', '*.me']:
            for f in Path('.').glob(pattern):
                logger.info(f'unlinking {f}')
                f.unlink()
[docs] @staticmethod def convert_pdfs(dir_name, output_folder='', pattern='*.pdf', format='png', dpi=200, transparent=True): """ Bulk conversion of all pdfs in dir_name to png. Linux (pdf2image) only. Pre-run! Does not adjust names in the text. """ if type(dir_name) == str: dir_name = Path(dir_name) if output_folder == '': output_folder = dir_name for f in dir_name.glob(pattern): fo = f.stem logger.info(f'converting {f.name} to {fo}') convert_from_path(str(f), dpi=dpi, output_folder=output_folder, fmt=format, transparent=transparent, output_file=fo, single_file=True)
[docs] @staticmethod def tex_to_dict(text): """ Convert text, a series of def{} macros into a dictionary returns the dictionary and the regex of all keys """ smacros = text.split('\n') smacros = [BlogPublisher.tex_splitter(i) for i in smacros] m = {i: j for (i, j) in smacros} regex = '|'.join([re.escape(k) for k in m.keys()]) return m, regex
[docs] @staticmethod def tex_splitter(x): """ x is a single def style tex macro """ x = x.replace('\\def', '') i = x.find('{') return x[:i], x[i + 1:-1]
    @staticmethod
    def post_tags_and_dates(dir_path):
        """
        Read info from a set of proto posts: tags come from the final
        comment line, the optional date from a '<!-- date: ... -->' comment
        on the second-to-last line.

        :param dir_path: directory of .md files (str or Path)
        :return: DataFrame with columns file, date, tags
        """
        if isinstance(dir_path, Path) is False:
            dir_path = Path(dir_path)
        ans = []
        for f in dir_path.glob('*.md'):
            t = f.read_text()
            st = t.strip().split('\n')
            # last line: '<!-- tag1, tag2 -->'
            tags = st[-1].strip()
            tags = tags.replace("<!--", "").replace("-->", "").strip()
            # second-to-last line: optional date comment
            date = st[-2].strip()
            if date[:10] == "<!-- date:":
                date = date[10:-4].strip()
            else:
                date = ''
            ans.append([f.name, date, tags])
        setup = pd.DataFrame(ans, columns=['file', 'date', 'tags'])
        return setup
class BlogPublisher(PublisherBase):

    def __init__(self, source_dir='.', update=False, dry_run=True, tex_engine='pdflatex'):
        """
        Manage creation of HTML blog-post files, including creating image
        files and changing links in Markdown. Objective is to publish as-is
        files that create TeX on the web.

        Adjustments: PDF images to PNG/JPG/SVG (change the link and create
        the PNG) and TikZ (create SVG file, find begin{figure} find caption
        and change Markdown).

        Adds a final comment to the HTML explaining where the file came
        from. Creates a .bak file with the same name and including all the
        edits. These SHOULD NEVER BE EDITED!

        If ``update`` is True overwrite existing older HTML files, otherwise
        skip if exists. If ``dry_run`` is True just explain what would
        happen. Note: defaults in fail safe mode!

        The Markdown file can optionally have:

        * A final comment with a list of tags; if there is no tag it is
          tagged NOC
        * A span, usually near the top, with class description that becomes
          the og summary and the card summary. If missing then the first
          150 or so (to a word break) are used.
        * An image with class og_image that becomes the og image (not too
          large!)

        These elements are used by BlogManager.

        :param source_dir: source directory for files to publish, default is cwd
        :param update: overwrite older existing HTML output
        :param dry_run: prepare everything but do not run pandoc
        :param tex_engine: pdflatex (fast but no fonts) or lualatex (slow
            but change fonts)
        """
        # NOTE(review): a Path source_dir is kept as-is but a str is
        # resolve()d -- confirm the asymmetry is intended.
        if isinstance(source_dir, Path):
            self.source_dir = source_dir
        else:
            self.source_dir = Path(source_dir).resolve()
        self.update = update
        self.dry_run = dry_run
        self.tex_engine = tex_engine
        super().__init__()
    def publish_file(self, fn):
        """
        fn is a markdown file to post, a str or Path object. Workflow is

        * read markdown, split, find tags (last comment), make post filename
        * check timing and existing blog post files to see if there are any
          updates
        * expand all @@@s (per markdown_make)
        * expand all basic tex macros
        * deal with pdf graphics (png/jpg/svg versions of pdf files must be
          created separately); if none is found, leave as pdf
        * deal with TikZ pictures and figures (after graphics because it
          introduces new ![] elements)
        * append workflow, including provenance of file
        * save .bak file
        * pandoc create HTML file

        The TeX engine comes from self.tex_engine: pdflatex uses
        blog/format/tikz.fmt (default fonts); lualatex runs without a
        template, slower but gives the fonts.

        :param fn: name of file (or path to file) of markdown
        :return: 0 on skip or dry run, otherwise the pandoc exit code
        """
        if isinstance(fn, str) is True:
            fn = self.source_dir / fn
        assert fn.exists()
        # reset workflow counter
        self.workflow_reset()
        self.workflow(f'INPUT: file={fn.name}')
        txt = fn.read_text(encoding='utf-8')
        stxt = txt.strip().split('\n')
        # tags are in a comment in the last line
        tags = stxt[-1].strip()
        if tags[:4] != '<!--':
            self.workflow('INPUT: NO TAGS using NOC')
            tags = 'NOC'
        else:
            # 'tag one, tag two' -> 'tag_one-tag_two' for the filename
            tags = tags.replace('<!--', '').replace('-->', '')
            tags = '-'.join([i.strip().replace(' ', '_')
                             for i in tags.split(',')])
        # second comment is a particular post date
        date = stxt[-2].strip()
        if date[:10] == "<!-- date:":
            date = date[10:-4].strip()
        else:
            date = ''
        if date != '':
            post_name = f'{fn.stem}---{date}-{tags}.html'
        else:
            post_name = f'{fn.stem}---{datetime.now():%Y-%m-%d}-{tags}.html'
        post_full_name = self.web_path / post_name
        self.workflow(f'INPUT: post={post_name}')
        # duplicates: same file name, any date, any tags
        pattern_name = f'{fn.stem}---????-??-??-*.html'
        matching_files = list(self.web_path.glob(pattern_name))
        if len(matching_files) > 0:
            if self.update is True:
                for f in matching_files:
                    if f.stat().st_mtime < fn.stat().st_mtime:
                        self.workflow(
                            f'UPDATE: Updating {f.name} an older HTML file for {fn.name}')
                    elif f.stat().st_mtime >= fn.stat().st_mtime:
                        logger.warning(
                            f'Skipping newer HTML file {f.name} for {fn.name} already exists')
                        return 0
                    else:
                        # this filename does not exist...but it was globbed
                        logger.warning(
                            f'Previous day creating today HTML file for {fn}??')
                        raise ValueError('This should be impossible')
            else:
                logger.warning(
                    f'EXITING: update==False but {len(matching_files)} matching blog HTML file(s) for {fn} exist(s)')
                return 0
        # else
        # still need to create the post...hence carry on
        # * expand all @@@s (per markdown_make)
        txt = self.process_includes(txt=txt, fn=fn)
        # now have the fully built source file
        # * expand all basic tex macros
        txt = self.process_tex_macros(txt, False)
        # * deal with other graphics (files must be created separately)
        # NOTE(review): adjust_image_links is not defined in this view --
        # presumably provided elsewhere on the class.
        txt = self.adjust_image_links(txt)
        # * deal with TikZ pictures and figures
        tikz = TikzManager(raw_input=txt, doc_path=fn,
                           tex_engine=self.tex_engine)
        tikz.process_tikz()
        txt = tikz.raw_input
        # merge the TikZ workflow entries into ours
        self._workflow.extend(tikz._workflow)
        # make the name for the temp file
        fn = fn.with_suffix('.me')
        # pandoc command (want in the workflow)
        command = ['pandoc', '-f', 'markdown', '-t', 'html',
                   '-o', str(post_full_name),
                   '--highlight-style=pygments', '--mathjax', '--citeproc',
                   f'--bibliography={Path.home()}/S/TELOS/biblio/library.bib',
                   str(fn)]
        # append source information as a comment
        self.workflow(f'BIBLIO: {Path.home()}/S/TELOS/biblio/library.bib')
        self.workflow(f'PROCESS: creating temp file {fn}')
        cmd = " ".join(command)
        Path('make_last.bat').write_text(cmd, encoding='utf-8')
        self.workflow(f'PROCESS: pandoc processing with {cmd}')
        self.workflow('PROCESS: use make_last.bat to re-run final step')
        # * write provenance of file
        txt = txt + self.workflow_get()
        # workflow is closed now...!
        # * Save .bak file
        # finally, write the temp file!
        fn.write_text(txt, encoding='utf-8')
        if self.dry_run:
            # logger.info(f'{fn} --> {post_full_name.name}')
            logger.info('Dry run...not executing...existing.')
            logger.info(f'See {fn.name} file for edits and changes.')
            return 0
        # * pandoc create HTML file
        logger.info(f'Pandoc execution on {fn.name}')
        with Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
            line1 = p.stdout.read()
            line2 = p.stderr.read()
            if line1:
                logger.info(line1)
            if line2:
                logger.error(line2)
            exit_code = p.poll()
        return exit_code
[docs] def publish_dir(self, pattern='*.md', tidy=True): """ Publish all files matching ``pattern`` to web_path """ for fn in self.source_dir.glob(pattern): self.publish_file(fn, tidy=tidy)
class TikzManager(PublisherBase):
    """
    Convert tikzpicture/tikzcd environments found in a document into
    stand-alone SVG files; see __init__ and process_tikz.
    """

    # stand-alone LaTeX wrapper for a single tikz environment; the .format
    # fields tikz_begin/tikz_code/tikz_end are filled in process_tikz, so
    # every literal brace is doubled
    _tex_template = """\\documentclass[border=5mm]{{standalone}}
% needs lualatex - uncomment for Wiley fonts
\\usepackage{{fontspec}}
\\setmainfont{{Stix Two Text}}
\\usepackage{{unicode-math}}
\\setmathfont{{Stix Two Math}}
\\usepackage{{url}}
\\usepackage{{tikz}}
\\usepackage{{color}}
\\usetikzlibrary{{arrows,calc,positioning,shadows.blur,decorations.pathreplacing}}
\\usetikzlibrary{{automata}}
\\usetikzlibrary{{fit}}
\\usetikzlibrary{{snakes}}
\\usetikzlibrary{{intersections}}
\\usetikzlibrary{{decorations.markings,decorations.text,decorations.pathmorphing,decorations.shapes}}
\\usetikzlibrary{{decorations.fractals,decorations.footprints}}
\\usetikzlibrary{{graphs}}
\\usetikzlibrary{{matrix}}
\\usetikzlibrary{{shapes.geometric}}
\\usetikzlibrary{{mindmap, shadows}}
\\usetikzlibrary{{backgrounds}}
\\usetikzlibrary{{cd}}
% really common macros
\\newcommand{{\\grtspacer}}{{\\vphantom{{lp}}}}
\\def\\dfrac{{\\displaystyle\\frac}}
\\def\\dint{{\\displaystyle\\int}}
\\begin{{document}}
{tikz_begin}{tikz_code}{tikz_end}
\\end{{document}}
"""
[docs] def __init__(self, *, raw_input='', doc_path=None, tex_engine='pdflatex'): """ Convert tikz figures in input text (raw_input) or a file (doc_path) into stand-alone svg files, saved in web_path (usually the static/img folder). If raw_input == '' then it is read from doc_path. doc_path is used to determine if temp .tex files need updating. When called by BlogPublisher, doc_path text has already been adjusted, hence raw_input. When called stand-alone raw_input==''. """ if type(doc_path) == str: self.doc_path = Path(doc_path) elif doc_path is not None: self.doc_path = doc_path if raw_input != '': self.raw_input = raw_input else: self.raw_input = doc_path.read_text(encoding='utf-8') if doc_path is None: # make a temp .tex filename self.doc_path = self.file_name(self.raw_input) self.tex_engine = tex_engine super().__init__()
[docs] @staticmethod def split_tikz(txt): """ Split text to get the tikzpicture. Format is initial text pip then groups of four: 1. begin tag ``(1::4)`` 2. tikz code ``(2::4)`` 3. end tag ``(3::4)`` 4. non-related text ``(4::4)`` """ return re.split(r'(\\begin{tikz(?:cd|picture)}|\\end{tikz(?:cd|picture)})', txt)
[docs] def split_figures(self): return re.split(r'(\\begin{figure}|\\begin{sidewaysfigure}|\\begin{table}|' r'\\end{figure}|\\end{sidewaysfigure}|\\end{table})', self.raw_input)
[docs] def list_tikz(self): """ List the figures in doc_fn """ return self.split_tikz(self.raw_input)[2::4]
    def process_tikz(self):
        """
        Process the tikz figures/tables/sidewaystables in the doc into svg
        files, rewriting each figure/table environment into a Markdown
        image link pointing at the published svg.
        """
        all_containers = self.split_figures()
        begin_tags = iter(all_containers[1::4])
        outer_codes = iter(all_containers[2::4])
        end_tags = iter(all_containers[3::4])
        # next_blob = iter(all_containers[4::4])
        for i, begin_tag, outer_code, end_tag in zip(count(), begin_tags, outer_codes, end_tags):
            # find tikzpicture, tikzcd etc.
            if outer_code.find('\\begin{tikz') >= 0:
                # container contains a tikzpicture
                caption = re.search(
                    r'\\caption\{((?:.|\n)*?)\}\n', outer_code, flags=re.MULTILINE)
                if caption is None:
                    caption = ''
                else:
                    caption = caption[1]
                # adjust the original doc; will create a tex file, tex it to
                # pdf, create svg file, link the svg file into web (and keep
                # a local copy). Name embeds a content hash so edits force
                # regeneration.
                svg_path = self.doc_path.with_suffix(
                    f'.{self.string_hash(outer_code)}.{i}.svg')
                tex_path = svg_path.with_suffix('.tex')
                web_path = self.static_img_path / svg_path.name
                # this is a string link for the output doc
                # NOTE(review): web_link is not defined in this view --
                # presumably provided elsewhere on the class.
                web_link = self.web_link(web_path)
                if begin_tag.find('figure') > 0:
                    lbl = '*Figure:*'
                else:
                    lbl = '*Table:*'
                self.raw_input = self.raw_input.replace(
                    f'{begin_tag}{outer_code}{end_tag}',
                    f"\n\n![{lbl} {caption}]({web_link}){{width=100%}}\n\n"
                )
                # do not have to worry about existing classes - this was a
                # figure or table...
                self.workflow(
                    f'TIKZ: replaced text for {begin_tag}...{end_tag} with ![...]({web_link})')
                # process if the svg files is older than doc_path
                # Assumes that you don't tinker with links...
                # is True and svg_path.stat().st_mtime >= self.doc_path.stat().st_mtime:
                if svg_path.exists():
                    self.workflow(
                        f'TIKZ: using existing svg file for Tikz #{i}, {svg_path.name}')
                else:
                    # make tex code for a stand-alone document
                    tikz_begin, tikz_code, tikz_end = self.split_tikz(outer_code)[
                        1:4]
                    tex_code = self._tex_template.format(
                        tikz_begin=tikz_begin, tikz_code=tikz_code, tikz_end=tikz_end)
                    tex_path.write_text(tex_code, encoding='utf-8')
                    self.workflow(
                        f'TIKZ: diagram #{i}, created temp file = {tex_path.name}')
                    pdf_file = tex_path.with_suffix('.pdf')
                    self.workflow(f'TIKZ: Update pdf file for Tikz #{i}')
                    if self.tex_engine == 'pdflatex':
                        # faster with template
                        # TODO EVID hard coded template
                        template = str(Path.home() / 'S/TELOS/Blog/format/tikz.fmt')
                        command = ['pdflatex', f'--fmt={template}', str(tex_path)]
                    else:
                        # for STIX fonts, no template
                        command = ['lualatex', str(tex_path)]
                    self.workflow(f'TIKZ: TeX Command={" ".join(command)}')
                    self.run_command(command)
                    self.workflow(
                        f'TIKZ: Creating svg file for Tikz #{i} (using new pdf2svg util)')
                    # https://github.com/jalios/pdf2svg-windows
                    command = [
                        'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg',
                        str(pdf_file), str(svg_path)]
                    # seems to return info on stderr?
                    self.run_command(command, flag=False)
                # create a nice name version of the svg file
                if str(web_path) != str(svg_path):
                    if web_path.exists():
                        web_path.unlink()
                    # NOTE(review): Path.link_to is pathlib3x/pre-3.12 only;
                    # stdlib 3.12 replaces it with hardlink_to (reversed
                    # argument order).
                    svg_path.link_to(web_path)
                    self.workflow(
                        f'TIKZ: Linking {web_path} pointing to {svg_path} for Tikz #{i}')
# command line related
def setup_parser():
    """
    Set up all command line options and return parser.

    Defines the three actions (``post_file``, ``post_dir``, ``convert``)
    plus their supporting options: general control (dry run, source
    directory, update-only mode), posting (TeX engine, file pattern,
    server refresh), and PDF conversion (pattern, output format, dpi).

    Needs to be a separate function for sphinx argparse.

    :return: configured :class:`argparse.ArgumentParser` object
    """
    parser = argparse.ArgumentParser(
        description='BlogManager: create and manage blog posts. All posted to the default Blog website (global variable).',
        # FIX: closing paren was missing after "--format"
        epilog='Examples: (1) python -m blog_tools -a post_file -f *.md posts all markdown files in the current directory. '
               '(2) python -m blog_tools -d new_posts -a post_dir posts all markdown files in the directory new_posts. '
               '(3) python -m blog_tools -a convert -c *.pdf converts all pdf files in the current directory to '
               '200 dpi PNGs (the defaults, set with --dpi and --format).'
    )

    # Debug group and general control
    parser.add_argument('-y', '--dry_run', action="store_true",
                        help='dry_run mode: nothing actually done.')
    parser.add_argument('-d', '--directory', action='store', type=str, default='',
                        metavar='SOURCE_DIRECTORY_NAME',
                        help='Source directory for files, default is cwd.')
    parser.add_argument('-u', '--update', action="store_true",
                        help='Update mode: only update files where md is newer than html.')

    action_list = ['post_file', 'post_dir', 'convert']
    parser.add_argument('-a', '--action', action='store', choices=action_list,
                        help='Determines the action: post a file, directory, or run pdf converter (Linux only).')

    # post related
    engine_list = ['pdflatex', 'lualatex']
    parser.add_argument('-t', '--tex', action='store', choices=engine_list, default='pdflatex',
                        help='Specify TeX engine. pdflatex = fast, no fonts; lualatex = slow with fonts.')
    # FIX: this help string was broken by a raw newline inside the literal;
    # rejoined as adjacent string literals.
    parser.add_argument('-f', '--files', action='store', type=str, default='',
                        metavar='FILE_PATTERN',
                        help='Files filtered matching FILE_PATTERN. For post or convert. '
                             'Can be a single filename.')
    parser.add_argument('-r', '--refresh', action="store_true",
                        help='Refresh server issuing a curl http://127.0.0.1:5000/blog/reset command.')

    # convert related
    # FIX: example previously showed "--format=*.pdf"; the pattern option is -c.
    parser.add_argument('-c', '--convert', action='store', type=str, default='*.pdf',
                        metavar='CONVERT_FILE_PATTERN',
                        help='Convert all files in current directory matching CONVERT_FILE_PATTERN to FORMAT. '
                             'Run from Linux (smve38_clean). For example, to convert img/*.pdf '
                             'python -m blog_tools -a convert -d img -c *.pdf. Converted files are written '
                             'to the same directory. ')
    parser.add_argument('--format', action='store', type=str, default='png',
                        metavar='FORMAT',
                        help='Set output file type FORMAT for convert.')
    parser.add_argument('--dpi', action='store', type=int, default=200,
                        metavar='DPI',
                        help='Set DPI level for convert.')

    return parser
[docs]def main(): """ handle command line operation needs to be a function for sphinx argparse. :return: """ logging.basicConfig(level=logging.DEBUG) parser = setup_parser() args = parser.parse_args() if args.directory == '': source_dir = Path('.') else: source_dir = Path(args.directory) # make blog publisher object bp = BlogPublisher(source_dir, update=args.update, dry_run=args.dry_run) if args.action == 'post_file': # just those matching args.post pattern for f in source_dir.glob(args.files): bp.publish_file(f) elif args.action == 'post_dir': bp.publish_dir() elif args.action == 'convert': bp.convert_pdfs(source_dir, source_dir, pattern=args.files, format=args.format, dpi=args.dpi) if args.refresh is True: command = ['curl', 'http://127.0.0.1:5000/blog/reset'] PublisherBase.run_command(command, flag=False)
if __name__ == '__main__': main()