Source code for blog_tools

# BlogManager
# BlogPublisher
# TikzManager
# ImageManager

from titlecase import titlecase
import pandas as pd
from datetime import datetime
from itertools import count
from bs4 import BeautifulSoup
import argparse
from pathlib3x import Path
from subprocess import Popen, PIPE
import hashlib
import logging
import re
import numpy as np

logger = logging.getLogger(__name__)

try:
    from pdf2image import convert_from_path
    has_convert_from_path = True
except ModuleNotFoundError:
    # logger.warning('No pdf2image...cannot convert PDF files to png.')
    has_convert_from_path = False

# blog website files
BLOG_PATH = Path.home() / 'new_mynl/blog'

# folder for created images (keep separate)
STATIC_IMAGE_PATH = (BLOG_PATH / '../static/blog_img').resolve()

# this is not printing??
# poor mans
# logger = type('asd', (), {})
# logger.info = print
# logger.warning = print
# logger.debug = print
# logger.error = print


[docs]class BlogManager(object):
    """
    BlogManager is used by the flask_app to do all the content management for the blog.
    Flask app handles the actual rendering, but all reports and extracts come from BlogManager.

    """

[docs]    def __init__(self, post_dir=''):
        """

        """
        if post_dir == '':
            self.post_dir = BLOG_PATH
        elif isinstance(post_dir, Path) is False:
            self.post_dir = Path(post_dir)
        else:
            self.post_dir = post_dir
        self._blog_df = None
        self._tag_to_post_df = None
        self._top_tags = None

[docs]    def refresh(self):
        """
        Force cache update
        """
        self._blog_df = None
        self._tag_to_post_df = None

    @property
    def blog_df(self):
        if self._blog_df is None:
            self._blog_df = self.blog_entries_to_df(self.post_dir)
        return self._blog_df

    @property
    def tag_to_post_df(self):
        if self._tag_to_post_df is None:
            self._tag_to_post_df = pd.DataFrame(columns=['tag', 'hash'])
            i = count()
            for n, x in self.blog_df.iterrows():
                for t in x.tag_list:
                    self._tag_to_post_df.loc[next(i)] = [t, n]
        return self._tag_to_post_df

    @property
    def top_tags(self):
        if self._top_tags is None:
            self._top_tags = self.tag_to_post_df.groupby('tag').count(). \
                reset_index(drop=False).sort_values(['hash', 'tag'],
                                                    ascending=[False, True]). \
                set_index('tag')
        return self._top_tags

    @property
    def top_tag_menu(self):
        """
        Menu items for top [5] tags
        """
        ans = []
        dd_ans = []
        for n, r in self.top_tags.head(5).iterrows():
            ans.append(
                f'<li class="nav-item"> <a class="nav-link" href="/blog/{n}">{n}</a></li>'
                # to include number of posts in parens
                # f'<li class="nav-item"> <a class="nav-link" href="/blog/{n}">{n} ({r["hash"]})</a></li>'
            )
        for n, r in self.top_tags.iloc[5:].iterrows():
            dd_ans.append(
                f'<a class="dropdown-item" href="/blog/{n}">{n} ({r["hash"]})</a>'
                # f'<a class="dropdown-item" href="/blog/{n}">{n} ({r["hash"]})</a>'
            )
        return '\n'.join(ans), '\n'.join(dd_ans)

[docs]    def search_tag(self, regex):
        """
        Search tags (list of individual tags) using a regex
        Only finds one tag type
        """
        # tags --> hashes
        idx = self.tag_to_post_df['tag'].str.match(regex, flags=re.IGNORECASE)
        # get post hash_idx, note there may be duplicates
        hash_idx = set(self.tag_to_post_df.loc[idx].hash)
        return self.blog_df.loc[hash_idx]

[docs]    def search_regex(self, regex, field):
        """
        Search through field using a regex and return relevant posts
        tag, title, html, post_date, access, modify, create, size
        NOT tag_list, words (a set)

        ``field`` must be a column in ``tag_to_post_df``

        Regex runs a contains query: you are responsible for start/finish

        """
        # regex mode
        hash_idx = self.blog_df[field].str.contains(regex, flags=re.IGNORECASE)
        return self.blog_df.loc[hash_idx]

[docs]    def search_query(self, query):
        """
        Send well formed query to blog_df
        """
        # query mode
        return self.blog_df.query(query)

    def _search(self, query):
        """
        Implements ! and and searching.

        :param query:
        :return:
        """
        if query[0] == '!':
            matching_posts = self.search_regex(query[1:], 'words')
            list_label = f'Recent posts with words matching "{query[1:]}"'

        elif query.find(' and ') >= 0:
            queries = query.split(" and ")
            idx = []
            for r in queries:
                match = self.search_tag(r)
                idx.append(match.index)
            ans = set(idx[0])
            for j in idx[1:]:
                ans = ans.intersection(j)
                if len(ans) == 0:
                    break
            # this works even if ans is empty
            matching_posts = self.blog_df.loc[ans]
            list_label = f'Recent posts tagged {query}'
        else:
            matching_posts = self.search_tag(query)
            list_label = f'Recent posts with tags matching "{query}"'
        return matching_posts, list_label

    @property
    def tags(self):
        """
        Returns an iterable of distinct tags
        """
        return self.tag_to_post_df['tag'].unique()

[docs]    def make_card_list(self, tag):
        """
        return six most recent posts, to be rendered in snapshot cards

        """
        matching_posts, list_label = self._search(tag)

        row_iter = matching_posts.sort_values(
            'modify_date', ascending=False).head(12).iterrows()
        card_list = []
        try:
            for r in range(2):
                cards = []
                card_list.append(cards)
                for i in range(3):
                    h, x = next(row_iter)
                    short_tags = self.tags_to_tag_links(x['tags'], glue=', ')
                    c1 = dict(title=x['title'], text=x['summary'],
                              hash=h, tags=short_tags, posted=x['post_date'])
                    cards.append(c1)
        except StopIteration:
            # bomb out when you get to the end...
            # return BLOG.search_tag(tag).to_html()
            pass
        return card_list

[docs]    def list_of_posts(self, regex):
        """
        Return list of matching blog files formatted as a HTML list. Three modes:

        1. regex is !str: run as a regex query against all words
        2. regex contains " and ", split and run intersection of type 3
        3. regix run as match against tag list.

        """

        matching_posts, list_label = self._search(regex)

        posts = []
        for n, r in matching_posts.sort_values(['modify_date', 'title'], ascending=(False, True)).iterrows():
            posts.append(f'<li> <a href="/blog?id={n}" class="text-light">{r["post_date"]} | '
                         f'{r["title"]}</a>')

        posts = '\n'.join(posts)

        return posts, list_label

[docs]    def report(self, kind):
        """
        Create a report about the posts; returns html

        kind = title, tag, date, statistics

        """
        renamer = {'post_date': 'Post date', 'title': 'Title', 'size': 'Size', 'tag': 'Tag',
                   'tags': 'Tags', 'avg_date': 'Avg date', 'number': 'Number', 'avg_size': 'Avg size',
                   'max_size': 'Max size', 'cross_tags': 'Num tags', 'modify_date': 'Modify date'}
        if kind == 'title':
            bit = self.blog_df[['post_date',
                                'modify_date', 'title', 'size', 'tags']]
            bit = bit.sort_values(['title', 'post_date'],
                                  ascending=[True, False])
            # bit.index = range(1, len(bit)+1)
            # bit.index.name = 'post'
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['title', 'post_date', 'size', 'tags']].rename(
                columns=renamer)
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()

        elif kind == 'date':
            bit = self.blog_df[['post_date', 'title', 'tags']]
            bit = bit.sort_values(['post_date', 'title'],
                                  ascending=[False, True])
            # bit.index = range(1, len(bit)+1)
            # bit.index.name = 'post'
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['post_date', 'title', 'tags']].rename(columns=renamer)
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()

        elif kind == 'modify':
            bit = self.blog_df[['modify_date', 'title', 'tags']]
            bit = bit.sort_values(
                ['modify_date', 'title'], ascending=[False, True])
            # bit.index = range(1, len(bit)+1)
            # bit.index.name = 'post'
            bit.title = [
                f'<a href="/blog?id={h}">{t}</a>' for h, t in bit['title'].items()]
            bit = bit[['modify_date', 'title', 'tags']].rename(columns=renamer)
            s = bit.style.format(
                formatter={'Modify date': "{:%Y-%m-%d}"}).hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()

        elif kind == 'tag':
            bit = self.tag_to_post_df.merge(self.blog_df[['post_date', 'title']],
                                            left_on='hash', right_index=True, how='left')
            bit = bit.sort_values(['tag', 'title'])
            # bit.index = range(1, len(bit)+1)
            # bit.index.name = 'post'
            bit.title = [
                f'<a href="/blog?id={h}">{i}</a>' for _, (h, i) in bit[['hash', 'title']].iterrows()]
            bit.tag = [f'<a href="/blog/{t}">{t}</a>' for t in bit['tag']]
            bit = bit[['tag', 'title', 'post_date']].rename(columns=renamer)
            s = bit.style.hide_index()
            s = s.set_table_styles([{'selector': 'td', 'props': [('text-align', 'left')]},
                                    {'selector': 'th', 'props': [('text-align', 'left')]}])
            return s.render()

        elif kind == 'statistics':
            bit = self.tag_to_post_df.merge(
                self.blog_df[['post_date', 'modify_date',
                              'size', 'title', 'summary', 'tag_list']],
                left_on='hash', right_index=True, how='left')
            # weird error
            gb = bit.groupby('tag').agg(number=('hash', np.size),
                                        avg_size=('size', np.mean),
                                        max_size=('size', np.max),
                                        avg_date=('modify_date', np.mean),
                                        cross_tags=('tag_list', lambda x: np.mean(
                                            [len(i) for i in x]))
                                        ).sort_values('number', ascending=False)
            gb['tag'] = [f'<a href="/blog/{i}">{i}</a>' for i in gb.index]
            gb = gb[['tag', 'avg_date', 'number', 'avg_size', 'max_size', 'cross_tags']]. \
                sort_values(['number', 'avg_date'], ascending=[False, True]). \
                rename(columns=renamer)
            # see https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
            s = gb.style.format(formatter={'Avg date': "{:%Y-%m-%d}", 'Avg size': '{:,.0f}',
                                           'Max size': '{:,.0f}', 'Num tags': '{:.1f}'}).hide_index()
            s = s.set_table_styles(
                [{'selector': 'td', 'props': [('text-align', 'right')]}])
            return s.render()
        else:
            return '<h2>Unknown report type</h2>'

[docs]    @staticmethod
    def blog_entries_to_df(p):
        """
        Convert blog entries in directory p (a Path) to a dataframe.
        Parse tags, title, etc.

        * Must start with an h1; generally that is the only h1 in the document
        * A final comment with a list of tags; if there is no tag it is tagged NOC
        * A span, usually near the top, with class description that becomes the og summary and the card summary. If
          missing then the first 150 or so (to a word break) are used.
        * An image with class og_image that becomes the og image (not too large!)

        """
        assert p.exists()

        posts = list(p.glob('*.html'))
        ans = []
        for post in posts:
            # logger.info(f'Processing {post.stem}')
            try:
                _, title, date, tags, _ = re.split(
                    r'(.*)\-\-\-(....\-..\-..)\-(.*)', post.stem)
                title = titlecase(title.replace('-', ' '))
                tags = tags.replace('-', ', ').replace('_', ' ')
                txt = post.read_text(encoding='utf-8')
                stats = post.stat()
                access, modify, create = map(lambda x: pd.to_datetime(x, unit='s', utc=True).tz_convert('US/Eastern'),
                                             [stats.st_atime, stats.st_mtime, stats.st_ctime])
                soup = BeautifulSoup(txt, "html.parser")
                # figure the description
                temp = soup('span', class_='description')
                if len(temp) > 0:
                    summary = str(temp[0])
                else:
                    # no canned description
                    extract_length = 35
                    n = 0
                    bit = []
                    for e in soup(['h2', 'h3', 'p', 'ul', 'ol', 'div', 'span']):
                        # need to get something...
                        if n == 0:
                            if len(e.text) < extract_length:
                                bit.append(e.text)
                            else:
                                temp1 = []
                                temp2 = 0
                                for w in e.text.split():
                                    if temp2 + len(w) < extract_length:
                                        temp1.append(w)
                                        temp2 += 1
                                    else:
                                        break
                                # _ = ' '.join(temp1)
                                # print('Appending', n, len(_), _)
                                bit.append(' '.join(temp1))
                                break
                        elif n + len(e.text) < extract_length:
                            bit.append(e.text)
                            n += len(e.text.split())
                        else:
                            break
                    summary = '\n'.join(bit)
                # figure an og_image
                temp = soup('img', id='og_image')
                if len(temp) > 0:
                    og_image = temp[0]['src']
                else:
                    temp = soup('img')
                    if len(temp) > 0:
                        og_image = temp[0]['src']
                    else:
                        og_image = ''
                h = hashlib.md5(post.stem.encode('utf-8')).digest().hex()
                ans.append([date, access, modify, create, stats.st_size, tags, tags.split(', '),
                            title, txt, summary, og_image, str(post), h])
            except ValueError:
                logger.error(f'{post}')

        df = pd.DataFrame(ans, columns=['post_date', 'access_date', 'modify_date', 'create_date', 'size',
                                        'tags', 'tag_list', 'title', 'html', 'summary', 'og_image', 'path', 'hash'])
        df['words'] = df.html.str.replace(
            pat=r'(?=<!--)([\s\S]*?)-->|<.*?>', repl='', flags=re.MULTILINE)
        # bag of words with no punct and lower case
        # df['bow'] = df.words.apply(lambda x: set(re.sub(r'[,.?!{}()\[\]]', '', x.lower(), flags=re.MULTILINE).split()))

        # hash based on the FILE NAME rather than the content... allows changes when entry has a fixed date
        # created above from post
        # originally
        # df.index = [hashlib.md5(t.encode('utf-8')).digest().hex() for t in df.html]
        # df.index.name = 'hash'
        df = df.set_index('hash')
        # remove duplicates (TODO: this shouldn't happen!) keep the oldest version of each post...remember the text is the same
        df = df.sort_values(['modify_date', 'title'], ascending=[False, True])
        dups = df.index.duplicated(keep='last')
        df = df.loc[~dups].copy()
        return df

[docs]    @staticmethod
    def name_to_parts(fn):
        """ fn is a Path """
        fn = fn.stem
        # date = fn[:10]
        tags, title = re.split(r'(.*?)\-\-\-(.*)', fn[11:])[1:-1]
        tags = tags.split('-')
        title = titlecase(title.replace('-', ' '))
        return tags, title

[docs]    @staticmethod
    def tags_to_tag_links(tags, glue):
        """
        Convert csv list of tags into links
        glue = ', ' or '\n' are common

        """
        ans = []
        for t in tags.split(','):
            t = t.strip()
            s = f'<a href="/blog/{t}"">{t}</a>'
            ans.append(s)
        return glue.join(ans)


[docs]class PublisherBase(object):
    """
    Container for some static functions.

    Handles workflow tracking

    """
    _macros = r"""\def\E{\mathsf{E}}
\def\Var{\mathsf{Var}}
\def\var{\mathsf{var}}
\def\SD{\mathsf{SD}}
\def\VaR{\mathsf{VaR}}
\def\CTE{\mathsf{CTE}}
\def\WCE{\mathsf{WCE}}
\def\AVaR{\mathsf{AVaR}}
\def\CVaR{\mathsf{CVaR}}
\def\TVaR{\mathsf{TVaR}}
\def\biTVaR{\mathsf{biTVaR}}
\def\ES{\mathsf{ES}}
\def\EPD{\mathsf{EPD}}
\def\cov{\mathsf{cov}}
\def\corr{\mathsf{Corr}}
\def\Pr{\mathsf{Pr}}
\def\ecirc{\accentset{\circ} e}
\def\dsum{\displaystyle\sum}
\def\dint{\displaystyle\int}
\def\AA{\mathcal{A}}
\def\bb{\bm{b}}
\def\ww{\bm{w}}
\def\xx{\bm{x}}
\def\yy{\bm{y}}
\def\HH{\bm{H}}
\def\FFF{\mathscr{F}}
\def\FF{\mathcal{F}}
\def\MM{\mathcal{M}}
\def\OO{\mathscr{O}}
\def\PPP{\mathscr{P}}
\def\PP{\mathsf{P}}
\def\QQ{\mathsf{Q}}
\def\RR{\mathbb{R}}
\def\ZZ{\mathbb{Z}}
\def\NN{\mathbb{N}}
\def\XXX{\mathcal{X}}
\def\XX{\bm{X}}
\def\ZZZ{\mathcal{Z}}
\def\bbeta{\bm{\beta}}
\def\cp{\mathsf{CP}}
\def\atan{\mathrm{atan}}
\def\ecirc{\accentset{\circ} e}
\def\tpx{{{}_tp_x}}
\def\kpx{{{}_kp_x}}
\def\tpy{{{}_tp_y}}
\def\tpxy{{{}_tp_{xy}}}
\def\tpxybar{{{}_tp_{\overline{xy}}}}
\def\tqx{{{}_tq_x}}"""

[docs]    def __init__(self):
        # path for posts
        self.web_path = BLOG_PATH
        # path for images
        self.static_img_path = STATIC_IMAGE_PATH

        # log workflow (TODO: sad that logged items are detached from where they are called)
        self._workflow = []
        self._n = 0

[docs]    def web_link(self, web_file):
        """
        Create a link of web_file relative to static_img_path
        """
        return '/' + (web_file.resolve().relative_to((self.web_path / '..').resolve())).as_posix()

[docs]    def workflow_reset(self):
        self._workflow = []
        self._n = 0

[docs]    def workflow(self, msg):
        """
        Add a message to the workflow
        :param msg:
        :return:
        """
        logger.info(msg)
        self._n += 1
        self._workflow.append(f'({self._n:02d}) {msg}')

[docs]    def workflow_show(self):
        print('\n'.join(self._workflow))

[docs]    def workflow_get(self):
        """
        Return the workflow object as an HTML comment
        """
        nl = '\n'
        return f"\n\n<!--\n{nl.join(self._workflow)}\n-->\n"

[docs]    def workflow_raw(self):
        return self._workflow

[docs]    def process_includes(self, *, txt='', fn=None):
        """
        Stand-alone process includes. txt = current status of buffer. fn = Path object source.
        If txt=None then text read from fm. If txt=='' then txt read from fn. This allows it to
        be used stand-alone.

        Not static because calls functions that access the workflow. But can be part of the base.

        :param txt:
        :param fn:
        :return: txt with includes resolved.
        """
        if txt == '':
            txt = fn.read_text(encoding='utf-8')

        if txt.find('@@@') < 0:
            return txt
        # else have work to do

        if fn is None:
            fn = Path('.')

        base_dir = fn.parent.resolve()
        n_includes = 0
        # first, substitute for all NNN specs (keep this for backwards compatibility)
        # assumes you are in the current directory
        file_map = {i[0:3]: i for i in base_dir.parent.glob("*.md")}
        txt, n_includes = self._process_includes(
            txt, base_dir, n_includes, file_map)
        self.workflow(f'IMPORT: {n_includes} files imported')
        return txt

    def _process_includes(self, txt, base_dir, n_includes, file_map):
        """
        Process @@@ include elements.
        From markdown_make.py without color_includes logic

        Iterative processing of include files
        file_map looks for nnn_something.md files in the current directory
        base_dir = directory name
        """

        includes = re.findall(
            r'@@@include ([\./]*)([0-9]{3}|[0-9A-Za-z])([^\n]+\.[a-z]+)?', txt)
        for res_ in includes:
            original_match = ''.join(res_)
            # logger.info(res_, file_map)
            # res_[1] looks for nnn type files and tries to find them in file_map
            if res_[2] == '':
                res = file_map[res_[1]]
                # logger.info(f'REPLACING {res_} with {res}')
            else:
                res = original_match
                # logger.info(f'using {"".join(res_)} as {res}')
            self.workflow(f'IMPORT: Importing {res}')
            n_includes += 1
            try:
                repl = (base_dir / res).read_text(encoding='utf-8')
                repl = self._strip_yaml(repl)
                repl, n_includes = self._process_includes(
                    repl, base_dir, n_includes, file_map)
                txt = txt.replace(f'@@@include {original_match}', repl)
            except FileNotFoundError:
                self.workflow(
                    f'IMPORT: WARNING @@@ included file {res} not found...ignoring')
        return txt, n_includes

    def _strip_yaml(self, text):
        """
        Strip starging yaml, between first --- and next --- from text.
        Applies to included files.
        From markdown_make.py.

        :param text:
        :return:
        """
        if text[:3] != '---':
            return text
        else:
            self.workflow('Stripped YAML')
            stext = text.split('\n')
            stext.pop(0)
            n = 0
            for ln in stext:
                if ln != '---':
                    n += 1
                else:
                    n += 1
                    return '\n'.join(stext[n:])

[docs]    def process_tex_macros(self, md_in, report=False):
        """
        Expand standard general.tex macros in the md_in text blog

        If ``additional_macros is not None`` then use it to update the standard list

        If ``report is True`` then just return the dictionary of macro substitutions
        """
        m, regex = PublisherBase.tex_to_dict(PublisherBase._macros)
        if report is True:
            return m, regex

        md_in, n = re.subn(regex, lambda x: m.get(
            x[0]), md_in, flags=re.MULTILINE)
        self.workflow(f'MACROS: {n} TeX macros substitutions')
        # lcroof is not handled

        return md_in

[docs]    @staticmethod
    def file_name(s):
        """
        Create a sensible random file name from a string s

        :param s:
        :return:
        """
        return Path('TMP_' + PublisherBase.string_hash(s) + '.tex')

[docs]    @staticmethod
    def string_hash(s):
        """
        Return hash of string s, as a hex string

        :param s:
        :return:
        """
        return hashlib.md5(s.encode('utf-8')).digest().hex()

[docs]    @staticmethod
    def run_command(command, flag=True):
        """
        Run a command and show results. Allows for weird xx behavior

        :param command:
        :param flag:
        :return:
        """
        with Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
            line1 = p.stdout.read()
            line2 = p.stderr.read()
            exit_code = p.poll()
            if line1:
                logger.info(line1[-250:])
            if line2:
                if flag:
                    raise ValueError(line2)
                else:
                    logger.info(line2)
        return exit_code

[docs]    @staticmethod
    def tidy():
        """
        tidy up the cwd

        :return:
        """
        for pattern in ['TMP_*.*', '*.bak', '*.log', '*.aux', '*.me']:
            for f in Path('.').glob(pattern):
                logger.info(f'unlinking {f}')
                f.unlink()

[docs]    @staticmethod
    def convert_pdfs(dir_name, output_folder='', pattern='*.pdf', format='png', dpi=200, transparent=True):
        """
        Bulk conversion of all pdfs in dir_name to png. Linux (pdf2image) only. Pre-run!
        Does not adjust names in the text.

        """
        if type(dir_name) == str:
            dir_name = Path(dir_name)

        if output_folder == '':
            output_folder = dir_name

        for f in dir_name.glob(pattern):
            fo = f.stem
            logger.info(f'converting {f.name} to {fo}')
            convert_from_path(str(f), dpi=dpi, output_folder=output_folder, fmt=format, transparent=transparent,
                              output_file=fo, single_file=True)

[docs]    @staticmethod
    def tex_to_dict(text):
        """
        Convert text, a series of def{} macros into a dictionary
        returns the dictionary and the regex of all keys
        """
        smacros = text.split('\n')
        smacros = [BlogPublisher.tex_splitter(i) for i in smacros]
        m = {i: j for (i, j) in smacros}
        regex = '|'.join([re.escape(k) for k in m.keys()])
        return m, regex

[docs]    @staticmethod
    def tex_splitter(x):
        """
        x is a single def style tex macro
        """
        x = x.replace('\\def', '')
        i = x.find('{')
        return x[:i], x[i + 1:-1]

[docs]    @staticmethod
    def post_tags_and_dates(dir_path):
        """
        Read info from a set of proto posts

        :param dir_path:
        :return:
        """
        if isinstance(dir_path, Path) is False:
            dir_path = Path(dir_path)
        ans = []
        for f in dir_path.glob('*.md'):
            t = f.read_text()
            st = t.strip().split('\n')
            tags = st[-1].strip()
            tags = tags.replace("<!--", "").replace("-->", "").strip()

            date = st[-2].strip()
            if date[:10] == "<!-- date:":
                date = date[10:-4].strip()
            else:
                date = ''
            ans.append([f.name, date, tags])
        setup = pd.DataFrame(ans, columns=['file', 'date', 'tags'])
        return setup


[docs]class BlogPublisher(PublisherBase):

[docs]    def __init__(self, source_dir='.', update=False, dry_run=True, tex_engine='pdflatex'):
        """
        Manage creation of HTML blog-post files, including creating and image files and changing links in Markdown.
        Objective is to published as-is files that create TeX on the web. Adjustments: PDF images to PND/JPG/SVG
        (change the link and create the PNG) and TikZ (create SVG file, find begin{figure} find caption and change
        Markdown).

        Adds a final comment to the HTML explaining where the file came from.

        Creates a .bak file with the same name and including all the edits. These SHOULD NEVER BE EDITED!

        web_path is the destination for the created HTML files. If =='' then BLOG_PATH is used

        If ``update`` is True overwrite existing older HTML files, otherwise skip if exists.
        If ``dry_run`` is True just explain what would happen.

        Note: defaults in fail safe mode!

        The Markdown file can optionally have:

        * A final comment with a list of tags; if there is no tag it is tagged NOC
        * A span, usually near the top, with class description that becomes the og summary and the card summary. If
          missing then the first 150 or so (to a word break) are used.
        * An image with class og_image that becomes the og image (not too large!)

        These elements are used by BlogManager.

        :param source_dir: source directory for files to publish, default is cwd
        :param web_path:   website directory; files are published here
        :param update:
        :param dry_run:
        :param tex_engine: pdflatex (fast but not fonts) or lualatex (slow but change fonts)
        """
        if isinstance(source_dir, Path):
            self.source_dir = source_dir
        else:
            self.source_dir = Path(source_dir).resolve()

        self.update = update
        self.dry_run = dry_run
        self.tex_engine = tex_engine
        super().__init__()

[docs]    def publish_file(self, fn):
        """
        fn is a markdown file to post file, a str or Path object. Workflow is

        * read markdown, split, find tags (last comment), make post filename
        * check timing and existing blog post files to see if there are any updates
        * expand all @@@s (per markdown_make)
        * expand all basic tex macros
        * deal with pdf graphics (png/jpg/svg versions of pdf files must be created separately); if none
          is found, leave as pdf
        * deal with TikZ pictures and figures (after graphics because it introduces new ![] elements)
        * append workflow, including provenance of file
        * Save .bak file
        * pandoc create HTML file

        :param fn: name of file (or path to file) of markdown
        :param tex_engine: if pdflatex uses blog/format/tikz.fmt and pdflatex, giving default fonts.
          If lualatex runs without template, slower but will give the fonts.
        """

        if isinstance(fn, str) is True:
            fn = self.source_dir / fn
            assert fn.exists()

        # reset workflow counter
        self.workflow_reset()
        self.workflow(f'INPUT: file={fn.name}')

        txt = fn.read_text(encoding='utf-8')

        stxt = txt.strip().split('\n')

        # tags are in a comment in the last line
        tags = stxt[-1].strip()
        if tags[:4] != '<!--':
            self.workflow('INPUT: NO TAGS using NOC')
            tags = 'NOC'
        else:
            tags = tags.replace('<!--', '').replace('-->', '')
            tags = '-'.join([i.strip().replace(' ', '_')
                            for i in tags.split(',')])

        # second comment is a particular post date
        date = stxt[-2].strip()
        if date[:10] == "<!-- date:":
            date = date[10:-4].strip()
        else:
            date = ''

        if date != '':
            post_name = f'{fn.stem}---{date}-{tags}.html'
        else:
            post_name = f'{fn.stem}---{datetime.now():%Y-%m-%d}-{tags}.html'
        post_full_name = self.web_path / post_name
        self.workflow(f'INPUT: post={post_name}')

        # duplicates: same file name, any date, any tags
        pattern_name = f'{fn.stem}---????-??-??-*.html'
        matching_files = list(self.web_path.glob(pattern_name))

        if len(matching_files) > 0:
            if self.update is True:
                for f in matching_files:
                    if f.stat().st_mtime < fn.stat().st_mtime:
                        self.workflow(
                            f'UPDATE: Updating {f.name} an older HTML file for {fn.name}')
                    elif f.stat().st_mtime >= fn.stat().st_mtime:
                        logger.warning(
                            f'Skipping newer HTML file {f.name} for {fn.name} already exists')
                        return 0
                    else:
                        # this filename does not exist...but it was globbed
                        logger.warning(
                            f'Previous day creating today HTML file for {fn}??')
                        raise ValueError('This should be impossible')
            else:
                logger.warning(
                    f'EXITING: update==False but {len(matching_files)} matching blog HTML file(s) for {fn} exist(s)')
                return 0
            # else
                # still need to create the post...hence carry on

        # * expand all @@@s (per markdown_make)
        txt = self.process_includes(txt=txt, fn=fn)

        # now have the fully built source file
        # * expand all basic tex macros
        txt = self.process_tex_macros(txt, False)

        # * deal with other graphics (files must be created separately)
        txt = self.adjust_image_links(txt)

        # * deal with TikZ pictures and figures
        tikz = TikzManager(raw_input=txt, doc_path=fn, tex_engine=self.tex_engine)
        tikz.process_tikz()
        txt = tikz.raw_input
        self._workflow.extend(tikz._workflow)

        # make the name for the temp file
        fn = fn.with_suffix('.me')

        # pandoc command (want in the workflow)
        command = ['pandoc', '-f', 'markdown', '-t', 'html', '-o', str(post_full_name),
                   '--highlight-style=pygments', '--mathjax', '--citeproc',
                   f'--bibliography={Path.home()}/S/TELOS/biblio/library.bib', str(fn)]

        # append source information as a comment
        self.workflow(f'BIBLIO: {Path.home()}/S/TELOS/biblio/library.bib')
        self.workflow(f'PROCESS: creating temp file {fn}')
        cmd = " ".join(command)
        Path('make_last.bat').write_text(cmd, encoding='utf-8')
        self.workflow(f'PROCESS: pandoc processing with {cmd}')
        self.workflow('PROCESS: use make_last.bat to re-run final step')
        # * write provenance of file
        txt = txt + self.workflow_get()
        # workflow is closed now...!

        # * Save .bak file
        # finally, write the temp file!
        fn.write_text(txt, encoding='utf-8')

        if self.dry_run:
            # logger.info(f'{fn} --> {post_full_name.name}')
            logger.info('Dry run...not executing...existing.')
            logger.info(f'See {fn.name} file for edits and changes.')
            return 0

        # * pandoc create HTML file
        logger.info(f'Pandoc execution on {fn.name}')
        with Popen(command, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
            line1 = p.stdout.read()
            line2 = p.stderr.read()
            if line1:
                logger.info(line1)
            if line2:
                logger.error(line2)
            exit_code = p.poll()

        return exit_code

[docs]    def publish_dir(self, pattern='*.md', tidy=True):
        """
        Publish all files matching ``pattern`` to web_path


        """
        for fn in self.source_dir.glob(pattern):
            self.publish_file(fn, tidy=tidy)

[docs]    def adjust_image_links(self, txt):
        """
        Convert pdf figure links. DOES NOT MAKE the new images (that needs pdf2image (Linux)); it looks
        for linkely contenders and selects one.

        Completely separate from dealing with tikz.

        Looks in the same folder for an appropriate non-pdf version of the file: prefers SVG then PNG then JPG.

        If no file found then sets link to ``default`` format and it is up to you to create that file (noted
        in the workflow).

        Note, these file names are futher tinkered to move them to the website static folder.

        See git history for an attempt to use divsvgm -P filename conversion...but those svg files do not
        render.

        """

        # need to look for images and copy them over; _file_renamer does a lot of work
        if txt.find('![') < 0:
            self.workflow('No image links found')

        # original pattern:
        # txt = re.sub(r'(?:!\[(?:.|\n)*?\])\((.*)\)', self._file_renamer, txt)

        # find candidates - lock in since you will be changing txt
        matches = list(re.findall(
            r'(!\[((?:.|\n)*?)\]\((.+?)\))(\{.*?\})?', txt))
        for whole_match, caption, file_name, classes in matches:
            image_file = self.source_dir / file_name
            if file_name[:4] == 'http':
                # external link - not adjusted
                self.workflow(f'IMAGE: External link unadjusted: {file_name}')
                continue
            elif image_file.exists() is True and image_file.suffix != '.pdf':
                self.workflow(f'IMAGE: Non PDF link unadjusted: {file_name}')
                new_file = image_file
            elif image_file.exists() is False:
                # this is just a general problem...should not occur often
                self.workflow(
                    f'IMAGE: Image file does not exist: leaving link unchanged for {file_name}')
                continue
            else:
                # file exists and is a pdf...we find a replacement (default leave as PDF)
                # look for candidate replacement file
                for kind in ['.svg', '.png', '.jpg']:
                    new_file = image_file.with_suffix(kind)
                    if new_file.exists() and new_file.stat().st_mtime >= image_file.stat().st_mtime:
                        new_file = image_file.with_suffix(kind)
                        break
                else:
                    # did not find an alternative, but still need to copy the pdf over
                    # new_file = image_file
                    # OK, make an SVG...
                    new_file = image_file.with_suffix('.svg')
                    self.workflow(
                        f'IMAGE: Creating svg file for {image_file.name} (using new pdf2svg util)')
                    # https://github.com/jalios/pdf2svg-windows
                    command = [
                        'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg', str(image_file), str(new_file)]
                    self.run_command(command)

            # copy over new file, which by construction must exist
            # create link to new file
            web_file = (self.static_img_path / new_file.name)
            if web_file.exists():
                web_file.unlink()
            # safe rather than sorry on re-creating the link
            self.workflow(f'IMAGE: Creating link {web_file} for {file_name}')
            # Path.link_to(web_file) syntax: make web_file a hard link to this path.
            new_file.link_to(web_file)

            # link for the website, relative to the base of the blog
            link_name = self.web_link(web_file)

            # finally, have to adjust the link name and add 100% width ; classes includes the braces
            txt = txt.replace(f'({file_name}){classes}',
                              f'({link_name}){{width=100%}}')
            self.workflow(
                f'IMAGE: txt image link  ![]({file_name}) replaced with ![...]({link_name})')
            if classes == '':
                self.workflow('IMAGE:>>>class {{width=100%}} added')
            else:
                self.workflow(
                    f'IMAGE:>>>class {classes} replaced with {{width=100%}}')

        return txt


[docs]class TikzManager(PublisherBase):
    _tex_template = """\\documentclass[border=5mm]{{standalone}}

% needs lualatex - uncomment for Wiley fonts
\\usepackage{{fontspec}}
\\setmainfont{{Stix Two Text}}
\\usepackage{{unicode-math}}
\\setmathfont{{Stix Two Math}}

\\usepackage{{url}}
\\usepackage{{tikz}}
\\usepackage{{color}}
\\usetikzlibrary{{arrows,calc,positioning,shadows.blur,decorations.pathreplacing}}
\\usetikzlibrary{{automata}}
\\usetikzlibrary{{fit}}
\\usetikzlibrary{{snakes}}
\\usetikzlibrary{{intersections}}
\\usetikzlibrary{{decorations.markings,decorations.text,decorations.pathmorphing,decorations.shapes}}
\\usetikzlibrary{{decorations.fractals,decorations.footprints}}
\\usetikzlibrary{{graphs}}
\\usetikzlibrary{{matrix}}
\\usetikzlibrary{{shapes.geometric}}
\\usetikzlibrary{{mindmap, shadows}}
\\usetikzlibrary{{backgrounds}}
\\usetikzlibrary{{cd}}

% really common macros
\\newcommand{{\\grtspacer}}{{\\vphantom{{lp}}}}

\\def\\dfrac{{\\displaystyle\\frac}}
\\def\\dint{{\\displaystyle\\int}}

\\begin{{document}}

{tikz_begin}{tikz_code}{tikz_end}

\\end{{document}}
"""

[docs]    def __init__(self, *, raw_input='', doc_path=None, tex_engine='pdflatex'):
        """
        Convert tikz figures in input text (raw_input) or a file (doc_path) into stand-alone svg files,
        saved in web_path (usually the static/img folder).

        If raw_input == '' then it is read from doc_path.

        doc_path is used to determine if temp .tex files need updating.

        When called by BlogPublisher, doc_path text has already been adjusted, hence raw_input.

        When called stand-alone raw_input==''.

        """
        if type(doc_path) == str:
            self.doc_path = Path(doc_path)
        elif doc_path is not None:
            self.doc_path = doc_path

        if raw_input != '':
            self.raw_input = raw_input
        else:
            self.raw_input = doc_path.read_text(encoding='utf-8')

        if doc_path is None:
            # make a temp .tex filename
            self.doc_path = self.file_name(self.raw_input)

        self.tex_engine = tex_engine

        super().__init__()

[docs]    @staticmethod
    def split_tikz(txt):
        """
        Split text to get the tikzpicture. Format is

        initial text pip then groups of four:

        1. begin tag ``(1::4)``
        2. tikz code ``(2::4)``
        3. end tag   ``(3::4)``
        4. non-related text ``(4::4)``

        """
        return re.split(r'(\\begin{tikz(?:cd|picture)}|\\end{tikz(?:cd|picture)})', txt)

[docs]    def split_figures(self):
        return re.split(r'(\\begin{figure}|\\begin{sidewaysfigure}|\\begin{table}|'
                        r'\\end{figure}|\\end{sidewaysfigure}|\\end{table})', self.raw_input)

[docs]    def list_tikz(self):
        """
        List the figures in doc_fn
        """
        return self.split_tikz(self.raw_input)[2::4]

[docs]    def process_tikz(self):
        """
        Process the tikz figures/tables/sidewaystables in the doc into svg files.
        """
        all_containers = self.split_figures()
        begin_tags = iter(all_containers[1::4])
        outer_codes = iter(all_containers[2::4])
        end_tags = iter(all_containers[3::4])
        # next_blob = iter(all_containers[4::4])

        for i, begin_tag, outer_code, end_tag in zip(count(), begin_tags, outer_codes, end_tags):
            # find tikzpicture, tikzcd etc.
            if outer_code.find('\\begin{tikz') >= 0:
                # container contains a tikzpicture
                caption = re.search(
                    r'\\caption\{((?:.|\n)*?)\}\n', outer_code, flags=re.MULTILINE)
                if caption is None:
                    caption = ''
                else:
                    caption = caption[1]
                # adjust the original doc; will create a tex file, tex it to pdf, create svg file,
                # link the svg file into web (and keep a local copy).
                svg_path = self.doc_path.with_suffix(
                    f'.{self.string_hash(outer_code)}.{i}.svg')
                tex_path = svg_path.with_suffix('.tex')
                web_path = self.static_img_path / svg_path.name
                # this is a string link for the output doc
                web_link = self.web_link(web_path)
                if begin_tag.find('figure') > 0:
                    lbl = '*Figure:*'
                else:
                    lbl = '*Table:*'
                self.raw_input = self.raw_input.replace(
                    f'{begin_tag}{outer_code}{end_tag}',
                    f"\n\n![{lbl} {caption}]({web_link}){{width=100%}}\n\n"
                )
                # do not have to worry about existing classes - this was a figure or table...
                self.workflow(
                    f'TIKZ: replaced text for {begin_tag}...{end_tag} with ![...]({web_link})')
                # process if the svg files is older than doc_path
                # Assumes that you don't tinker with links...
                # is True and svg_path.stat().st_mtime >= self.doc_path.stat().st_mtime:
                if svg_path.exists():
                    self.workflow(
                        f'TIKZ: using existing svg file for Tikz #{i}, {svg_path.name}')
                else:
                    # make tex code for a stand-alone document
                    tikz_begin, tikz_code, tikz_end = self.split_tikz(outer_code)[
                        1:4]
                    tex_code = self._tex_template.format(
                        tikz_begin=tikz_begin, tikz_code=tikz_code, tikz_end=tikz_end)
                    tex_path.write_text(tex_code, encoding='utf-8')
                    self.workflow(
                        f'TIKZ: diagram #{i}, created temp file = {tex_path.name}')
                    pdf_file = tex_path.with_suffix('.pdf')
                    self.workflow(f'TIKZ: Update pdf file for Tikz #{i}')
                    if self.tex_engine == 'pdflatex':
                        # faster with template
                        # TODO EVID hard coded template
                        template = str(Path.home() / 'S/TELOS/Blog/format/tikz.fmt')
                        command = ['pdflatex', f'--fmt={template}', str(tex_path)]
                    else:
                        # for STIX fonts, no template
                        command = ['lualatex', str(tex_path)]
                    self.workflow(f'TIKZ: TeX Command={" ".join(command)}')
                    self.run_command(command)
                    self.workflow(
                        f'TIKZ: Creating svg file for Tikz #{i} (using new pdf2svg util)')
                    # https://github.com/jalios/pdf2svg-windows
                    command = [
                        'C:\\temp\\pdf2svg-windows\\dist-64bits\\pdf2svg', str(pdf_file), str(svg_path)]
                    # seems to return info on stderr?
                    self.run_command(command, flag=False)
                    # create a nice name version of the svg file
                    if str(web_path) != str(svg_path):
                        if web_path.exists():
                            web_path.unlink()
                        svg_path.link_to(web_path)
                        self.workflow(
                            f'TIKZ: Linking {web_path} pointing to {svg_path} for Tikz #{i}')


# command line related
[docs]def setup_parser():
    """
    Set up all command line options and return parser

    :return:  parser object
    """
    parser = argparse.ArgumentParser(
        description='BlogManager: create and manage blog posts. All posted to the default Blog website (global variable).',
        epilog='Examples: (1) python -m blog_tools -a post_file -f *.md  posts all markdown files in the current directory. '
               '(2) python -m blog_tools -d new_posts -a post_dir posts all markdown files in the directory new_posts. '
               '(3) python -m blog_tools -a convert -c *.pdf  converts all pdf files in the current directory to '
               '200 dpi PNGs (the defaults, set with --dpi and --format.'
    )
    # Debug group and general control
    parser.add_argument('-y', '--dry_run', action="store_true",
                        help='dry_run mode: nothing actually done.')

    parser.add_argument('-d', '--directory', action='store', type=str, default='',
                        metavar='SOURCE_DIRECTORY_NAME',
                        help='Source directory for files, default is cwd.')

    # parser.add_argument('-w', '--web_directory', action='store', type=str, default='',
    #                     metavar='WEBSITE_DIRECTORY_NAME',
    #                     help=f'Destination directory for posts. Defaults to {BLOG_PATH}')

    parser.add_argument('-u', '--update', action="store_true",
                        help='Update mode: only update files where md is newer than html.')

    action_list = ['post_file', 'post_dir', 'convert']
    parser.add_argument('-a', '--action', action='store', choices=action_list,
                        help='Determines the action: post a file, directory, or run pdf converter (Linux only).')

    # post related
    engine_list = ['pdflatex', 'lualatex']
    parser.add_argument('-t', '--tex', action='store', choices=engine_list, default='pdflatex',
                        help='Specify TeX engine. pdflatex = fast, no fonts; lualatex = slow with fonts.')

    parser.add_argument('-f', '--files', action='store', type=str, default='',
                        metavar='FILE_PATTERN',
                        help='Files filtered matching FILE_PATTERN. For post or convert. Can be a single filename.')

    parser.add_argument('-r', '--refresh', action="store_true",
                        help='Refresh server issuing a curl http://127.0.0.1:5000/blog/reset command.')

    # convert related
    parser.add_argument('-c', '--convert', action='store', type=str, default='*.pdf',
                        metavar='CONVERT_FILE_PATTERN',
                        help='Convert all files in current directory matching CONVERT_FILE_PATTERN to FORMAT. '
                             'Run from Linux (smve38_clean). For example, to  convert img/*.pdf '
                             'python -m blog_tools -a convert -d img --format=*.pdf. Converted files are written '
                             'to the same directory. ')

    parser.add_argument('--format', action='store', type=str, default='png',
                        metavar='FORMAT',
                        help='Set output file type FORMAT for convert.')

    parser.add_argument('--dpi', action='store', type=int, default=200,
                        metavar='DPI',
                        help='Set DPI level for convert.')

    # other examples
    # parser.add_argument('-l', '--limit', action="store", type=int, default=10,
    #                     help='Meta: download limit number of posts.')
    # parser.add_argument('-i', '--images', action="store_true",
    #                     help='Meta: download associated images.')

    return parser


[docs]def main():
    """
    handle command line operation
    needs to be a function for sphinx argparse.
    :return:
    """

    logging.basicConfig(level=logging.DEBUG)

    parser = setup_parser()
    args = parser.parse_args()

    if args.directory == '':
        source_dir = Path('.')
    else:
        source_dir = Path(args.directory)

    # make blog publisher object
    bp = BlogPublisher(source_dir, update=args.update, dry_run=args.dry_run)

    if args.action == 'post_file':
        # just those matching args.post pattern
        for f in source_dir.glob(args.files):
            bp.publish_file(f)

    elif args.action == 'post_dir':
        bp.publish_dir()

    elif args.action == 'convert':
        bp.convert_pdfs(source_dir, source_dir, pattern=args.files,
                        format=args.format, dpi=args.dpi)

    if args.refresh is True:
        command = ['curl', 'http://127.0.0.1:5000/blog/reset']
        PublisherBase.run_command(command, flag=False)

if __name__ == '__main__':
    main()