diff --git a/Makefile b/Makefile
index 5d524cd..f7418cb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,28 @@
-PROJECT=dekker_thesis
+TARGETS = dekker_thesis.pdf
+LATEX=xelatex
+BIBTEX=biber
+BUILDDIR=build
 
-.PHONY: $(PROJECT).pdf clean
+.PHONY: all update clean clobber FORCE
 
-$(PROJECT).pdf: $(PROJECT).tex
-	pipenv run latexmk --shell-escape $<
+all: $(TARGETS)
+
+# Stamp file: the pipenv virtualenv is in sync with Pipfile.lock.
+$(BUILDDIR)/.sync:
+	mkdir -p $(BUILDDIR)
+	pipenv sync
+	touch $@
+
+%.pdf: %.tex $(BUILDDIR)/.sync FORCE
+	pipenv run ./latexrun --latex-cmd=$(LATEX) --latex-args="-shell-escape" --bibtex-cmd=$(BIBTEX) -O $(BUILDDIR) $<
+
+update:
+	pipenv lock
+	rm -f $(BUILDDIR)/.sync
 
 clean:
-	latexmk -c
+	pipenv run ./latexrun --clean-all -O $(BUILDDIR)
+	rm -f indent.log
 
 clobber:
-	latexmk -C
+	rm -rf $(BUILDDIR)/
diff --git a/latexrun b/latexrun
new file mode 100755
index 0000000..91dbbd6
--- /dev/null
+++ b/latexrun
@@ -0,0 +1,2007 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2013, 2014 Austin Clements
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+import sys
+import os
+import errno
+import argparse
+import shlex
+import json
+import subprocess
+import re
+import collections
+import hashlib
+import shutil
+import curses
+import filecmp
+import io
+import traceback
+import time
+
+try:
+    import fcntl
+except ImportError:
+    # Non-UNIX platform
+    fcntl = None
+
+def debug(string, *args):
+    if debug.enabled:
+        print(string.format(*args), file=sys.stderr)
+debug.enabled = False
+
+def debug_exc():
+    if debug.enabled:
+        traceback.print_exc()
+
+def main():
+    # Parse command-line
+    arg_parser = argparse.ArgumentParser(
+        description='''A 21st century LaTeX wrapper:
+        %(prog)s runs latex (and bibtex) the right number of times so you
+        don't have to,
+        strips the log spew to make errors visible,
+        and plays well with standard build tools.''')
+    arg_parser.add_argument(
+        '-o', metavar='FILE', dest='output', default=None,
+        help='Output file name (default: derived from input file)')
+    arg_parser.add_argument(
+        '--latex-cmd', metavar='CMD', default='pdflatex',
+        help='Latex command (default: %(default)s)')
+    arg_parser.add_argument(
+        '--latex-args', metavar='ARGS', type=arg_parser_shlex,
+        help='Additional command-line arguments for latex.'
+ ' This will be parsed and split using POSIX shell rules.') + arg_parser.add_argument( + '--bibtex-cmd', metavar='CMD', default='bibtex', + help='Bibtex command (default: %(default)s)') + arg_parser.add_argument( + '--bibtex-args', metavar='ARGS', type=arg_parser_shlex, + help='Additional command-line arguments for bibtex') + arg_parser.add_argument( + '--makeglossaries-cmd', metavar='CMD', default='makeglossaries', + help='Makeglossaries command (default: %(default)s)') + arg_parser.add_argument( + '--makeglossaries-args', metavar='ARGS', type=arg_parser_shlex, + help='Additional command-line arguments for makeglossaries') + arg_parser.add_argument( + '--max-iterations', metavar='N', type=int, default=10, + help='Max number of times to run latex before giving up' + ' (default: %(default)s)') + arg_parser.add_argument( + '-W', metavar='(no-)CLASS', + action=ArgParserWarnAction, dest='nowarns', default=set(['underfull']), + help='Enable/disable warning from CLASS, which can be any package name, ' + 'LaTeX warning class (e.g., font), bad box type ' + '(underfull, overfull, loose, tight), or "all"') + arg_parser.add_argument( + '-O', metavar='DIR', dest='obj_dir', default='latex.out', + help='Directory for intermediate files and control database ' + '(default: %(default)s)') + arg_parser.add_argument( + '--color', choices=('auto', 'always', 'never'), default='auto', + help='When to colorize messages') + arg_parser.add_argument( + '--verbose-cmds', action='store_true', default=False, + help='Print commands as they are executed') + arg_parser.add_argument( + '--debug', action='store_true', + help='Enable detailed debug output') + actions = arg_parser.add_argument_group('actions') + actions.add_argument( + '--clean-all', action='store_true', help='Delete output files') + actions.add_argument( + 'file', nargs='?', help='.tex file to compile') + args = arg_parser.parse_args() + if not any([args.clean_all, args.file]): + arg_parser.error('at least one action is required') + args.latex_args = args.latex_args or [] + args.bibtex_args = args.bibtex_args or [] + args.makeglossaries_args = args.makeglossaries_args or [] + + verbose_cmd.enabled = args.verbose_cmds + debug.enabled = args.debug + + # A note about encodings: POSIX encoding is a mess; TeX encoding + # is a disaster. Our goal is to make things no worse, so we want + # byte-accurate round-tripping of TeX messages. Since TeX + # messages are *basically* text, we use strings and + # surrogateescape'ing for both input and output. I'm not fond of + # setting surrogateescape globally, but it's far easier than + # dealing with every place we pass TeX output through. + # Conveniently, JSON can round-trip surrogateescape'd strings, so + # our control database doesn't need special handling. + sys.stdout = io.TextIOWrapper( + sys.stdout.buffer, encoding=sys.stdout.encoding, + errors='surrogateescape', line_buffering=sys.stdout.line_buffering) + sys.stderr = io.TextIOWrapper( + sys.stderr.buffer, encoding=sys.stderr.encoding, + errors='surrogateescape', line_buffering=sys.stderr.line_buffering) + + Message.setup_color(args.color) + + # Open control database. + dbpath = os.path.join(args.obj_dir, '.latexrun.db') + if not os.path.exists(dbpath) and os.path.exists('.latexrun.db'): + # The control database used to live in the source directory. + # Support this for backwards compatibility. 
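+        # (A fresh database is always created inside the object
+        # directory, where `--clean-all -O DIR` can find it.)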
+ dbpath = '.latexrun.db' + try: + db = DB(dbpath) + except (ValueError, OSError) as e: + print('error opening {}: {}'.format(e.filename if hasattr(e, 'filename') + else dbpath, e), + file=sys.stderr) + debug_exc() + sys.exit(1) + + # Clean + if args.clean_all: + try: + db.do_clean(args.obj_dir) + except OSError as e: + print(e, file=sys.stderr) + debug_exc() + sys.exit(1) + + # Build + if not args.file: + return + task_commit = None + try: + task_latex = LaTeX(db, args.file, args.latex_cmd, args.latex_args, + args.obj_dir, args.nowarns) + task_commit = LaTeXCommit(db, task_latex, args.output) + task_bibtex = BibTeX(db, task_latex, args.bibtex_cmd, args.bibtex_args, + args.nowarns, args.obj_dir) + task_glossaries = MakeGlossaries(db, task_latex, args.makeglossaries_cmd, args.makeglossaries_args, + args.nowarns, args.obj_dir) + tasks = [task_latex, task_commit, task_bibtex, task_glossaries] + stable = run_tasks(tasks, args.max_iterations) + + # Print final task output and gather exit status + status = 0 + for task in tasks: + status = max(task.report(), status) + + if not stable: + print('error: files are still changing after {} iterations; giving up' + .format(args.max_iterations), file=sys.stderr) + status = max(status, 1) + except TaskError as e: + print(str(e), file=sys.stderr) + debug_exc() + status = 1 + + # Report final status, if interesting + fstatus = 'There were errors' if task_commit is None else task_commit.status + if fstatus: + output = args.output + if output is None: + if task_latex.get_outname() is not None: + output = os.path.basename(task_latex.get_outname()) + else: + output = 'output' + if Message._color: + terminfo.send('bold', ('setaf', 1)) + print('{}; {} not updated'.format(fstatus, output)) + if Message._color: + terminfo.send('sgr0') + sys.exit(status) + +def arg_parser_shlex(string): + """Argument parser for shell token lists.""" + try: + return shlex.split(string) + except ValueError as e: + raise argparse.ArgumentTypeError(str(e)) from None + +class ArgParserWarnAction(argparse.Action): + def __call__(self, parser, namespace, value, option_string=None): + nowarn = getattr(namespace, self.dest) + if value == 'all': + nowarn.clear() + elif value.startswith('no-'): + nowarn.add(value[3:]) + else: + nowarn.discard(value) + setattr(namespace, self.dest, nowarn) + +def verbose_cmd(args, cwd=None, env=None): + if verbose_cmd.enabled: + cmd = ' '.join(map(shlex.quote, args)) + if cwd is not None: + cmd = '(cd {} && {})'.format(shlex.quote(cwd), cmd) + if env is not None: + for k, v in env.items(): + if os.environ.get(k) != v: + cmd = '{}={} {}'.format(k, shlex.quote(v), cmd) + print(cmd, file=sys.stderr) +verbose_cmd.enabled = False + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: raise + +class DB: + """A latexrun control database.""" + + _VERSION = 'latexrun-db-v2' + + def __init__(self, filename): + self.__filename = filename + + # Make sure database directory exists + if os.path.dirname(self.__filename): + os.makedirs(os.path.dirname(self.__filename), exist_ok=True) + + # Lock the database if possible. We don't release this lock + # until the process exits. + lockpath = self.__filename + '.lock' + if fcntl is not None: + lockfd = os.open(lockpath, os.O_CREAT|os.O_WRONLY|os.O_CLOEXEC, 0o666) + # Note that this is actually an fcntl lock, not a lockf + # lock. Don't be fooled. 
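+            # (Python's fcntl.lockf() wraps fcntl(2) F_SETLK/F_SETLKW
+            # record locks, not lockf(3) or flock(2); the lock is
+            # released automatically when this process exits.)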
+            fcntl.lockf(lockfd, fcntl.LOCK_EX, 1)
+
+        try:
+            fp = open(filename, 'r')
+        except FileNotFoundError:
+            debug('creating new database')
+            self.__val = {'version': DB._VERSION}
+        else:
+            debug('loading database')
+            self.__val = json.load(fp)
+            if 'version' not in self.__val:
+                raise ValueError(
+                    '{} exists, but does not appear to be a latexrun database'
+                    .format(filename))
+            if self.__val['version'] != DB._VERSION:
+                raise ValueError('unknown database version {!r}'
+                                 .format(self.__val['version']))
+
+    def commit(self):
+        debug('committing database')
+        # Atomically commit database
+        tmp_filename = self.__filename + '.tmp'
+        with open(tmp_filename, 'w') as fp:
+            json.dump(self.__val, fp, indent=2, separators=(',', ': '))
+            fp.flush()
+            os.fsync(fp.fileno())
+        os.rename(tmp_filename, self.__filename)
+
+    def get_summary(self, task_id):
+        """Return the recorded summary for the given task or None."""
+        return self.__val.get('tasks', {}).get(task_id)
+
+    def set_summary(self, task_id, summary):
+        """Set the summary for the given task."""
+        self.__val.setdefault('tasks', {})[task_id] = summary
+
+    def add_clean(self, filename):
+        """Add an output file to be cleaned.
+
+        Unlike the output files recorded in the task summaries,
+        cleanable files strictly accumulate until a clean is
+        performed.
+        """
+        self.__val.setdefault('clean', {})[filename] = hash_cache.get(filename)
+
+    def do_clean(self, obj_dir=None):
+        """Remove output files and delete database.
+
+        If obj_dir is not None and it is empty after all files are
+        removed, it will also be removed.
+        """
+
+        for f, want_hash in self.__val.get('clean', {}).items():
+            have_hash = hash_cache.get(f)
+            if have_hash is not None:
+                if want_hash == have_hash:
+                    debug('unlinking {}', f)
+                    hash_cache.invalidate(f)
+                    os.unlink(f)
+                else:
+                    print('warning: {} has changed; not removing'.format(f),
+                          file=sys.stderr)
+        self.__val = {'version': DB._VERSION}
+        try:
+            os.unlink(self.__filename)
+        except FileNotFoundError:
+            pass
+        if obj_dir is not None:
+            try:
+                os.rmdir(obj_dir)
+            except OSError:
+                pass
+
+class HashCache:
+    """Cache of file hashes.
+
+    As latexrun reaches fixed-point, it hashes the same files over and
+    over, many of which never change.  Since hashing is somewhat
+    expensive, we keep a simple cache of these hashes.
+    """
+
+    def __init__(self):
+        self.__cache = {}
+
+    def get(self, filename):
+        """Return the hash of filename, '*' if it was clobbered, or
+        None if it doesn't exist."""
+        try:
+            with open(filename, 'rb') as fp:
+                st = os.fstat(fp.fileno())
+                key = (st.st_dev, st.st_ino)
+                if key in self.__cache:
+                    return self.__cache[key]
+
+                debug('hashing {}', filename)
+                h = hashlib.sha256()
+                while True:
+                    block = fp.read(256*1024)
+                    if not len(block):
+                        break
+                    h.update(block)
+                self.__cache[key] = h.hexdigest()
+                return self.__cache[key]
+        except (FileNotFoundError, IsADirectoryError):
+            return None
+
+    def clobber(self, filename):
+        """If filename's hash is not known, record an invalid hash.
+
+        This can be used when filename was overwritten before we were
+        necessarily able to obtain its hash.  filename must exist.
+        """
+        st = os.stat(filename)
+        key = (st.st_dev, st.st_ino)
+        if key not in self.__cache:
+            self.__cache[key] = '*'
+
+    def invalidate(self, filename):
+        try:
+            st = os.stat(filename)
+        except OSError as e:
+            # Pessimistically wipe the whole cache
+            debug('wiping hash cache ({})', e)
+            self.__cache.clear()
+        else:
+            key = (st.st_dev, st.st_ino)
+            if key in self.__cache:
+                del self.__cache[key]
+hash_cache = HashCache()
+
+class _Terminfo:
+    def __init__(self):
+        self.__tty = os.isatty(sys.stdout.fileno())
+        if self.__tty:
+            curses.setupterm()
+        self.__ti = {}
+
+    def __ensure(self, cap):
+        if cap not in self.__ti:
+            if not self.__tty:
+                string = None
+            else:
+                string = curses.tigetstr(cap)
+                if string is None or b'$<' in string:
+                    # Don't have this capability or it has a pause
+                    string = None
+            self.__ti[cap] = string
+        return self.__ti[cap]
+
+    def has(self, *caps):
+        return all(self.__ensure(cap) is not None for cap in caps)
+
+    def send(self, *caps):
+        # Flush TextIOWrapper to the binary IO buffer
+        sys.stdout.flush()
+        for cap in caps:
+            # We should use curses.putp here, but it's broken in
+            # Python3 because it writes directly to C's buffered
+            # stdout and there's no way to flush that.
+            if isinstance(cap, tuple):
+                s = curses.tparm(self.__ensure(cap[0]), *cap[1:])
+            else:
+                s = self.__ensure(cap)
+            sys.stdout.buffer.write(s)
+terminfo = _Terminfo()
+
+class Progress:
+    _enabled = None
+
+    def __init__(self, prefix):
+        self.__prefix = prefix
+        if Progress._enabled is None:
+            Progress._enabled = (not debug.enabled) and \
+                terminfo.has('cr', 'el', 'rmam', 'smam')
+
+    def __enter__(self):
+        self.last = ''
+        self.update('')
+        return self
+
+    def __exit__(self, typ, value, traceback):
+        if Progress._enabled:
+            # Beginning of line and clear
+            terminfo.send('cr', 'el')
+            sys.stdout.flush()
+
+    def update(self, msg):
+        if not Progress._enabled:
+            return
+        out = '[' + self.__prefix + ']'
+        if msg:
+            out += ' ' + msg
+        if out != self.last:
+            # Beginning of line, clear line, disable wrap
+            terminfo.send('cr', 'el', 'rmam')
+            sys.stdout.write(out)
+            # Enable wrap
+            terminfo.send('smam')
+            self.last = out
+            sys.stdout.flush()
+
+class Message(collections.namedtuple(
+        'Message', 'typ filename lineno msg')):
+    def emit(self):
+        if self.filename:
+            if self.filename.startswith('./'):
+                finfo = self.filename[2:]
+            else:
+                finfo = self.filename
+        else:
+            finfo = ''
+        if self.lineno is not None:
+            finfo += ':' + str(self.lineno)
+        finfo += ': '
+        if self._color:
+            terminfo.send('bold')
+        sys.stdout.write(finfo)
+
+        if self.typ != 'info':
+            if self._color:
+                terminfo.send(('setaf', 5 if self.typ == 'warning' else 1))
+            sys.stdout.write(self.typ + ': ')
+        if self._color:
+            terminfo.send('sgr0')
+        sys.stdout.write(self.msg + '\n')
+
+    @classmethod
+    def setup_color(cls, state):
+        if state == 'never':
+            cls._color = False
+        elif state == 'always':
+            cls._color = True
+        elif state == 'auto':
+            cls._color = terminfo.has('setaf', 'bold', 'sgr0')
+        else:
+            raise ValueError('Illegal color state {!r}'.format(state))
+
+
+##################################################################
+# Task framework
+#
+
+terminate_task_loop = False
+start_time = time.time()
+
+def run_tasks(tasks, max_iterations):
+    """Execute tasks in round-robin order until all are stable.
+
+    This will also exit if terminate_task_loop is true.  Tasks may use
+    this to terminate after a fatal error (even if that fatal error
+    doesn't necessarily indicate stability; as long as re-running the
+    task will never eliminate the fatal error).
+ + Return True if fixed-point is reached or terminate_task_loop is + set within max_iterations iterations. + """ + + global terminate_task_loop + terminate_task_loop = False + + nstable = 0 + for iteration in range(max_iterations): + for task in tasks: + if task.stable(): + nstable += 1 + if nstable == len(tasks): + debug('fixed-point reached') + return True + else: + task.run() + nstable = 0 + if terminate_task_loop: + debug('terminate_task_loop set') + return True + debug('fixed-point not reached') + return False + +class TaskError(Exception): + pass + +class Task: + """A deterministic computation whose inputs and outputs can be captured.""" + + def __init__(self, db, task_id): + self.__db = db + self.__task_id = task_id + + def __debug(self, string, *args): + if debug.enabled: + debug('task {}: {}', self.__task_id, string.format(*args)) + + def stable(self): + """Return True if running this task will not affect system state. + + Functionally, let f be the task, and s be the system state. + Then s' = f(s). If it must be that s' == s (that is, f has + reached a fixed point), then this function must return True. + """ + last_summary = self.__db.get_summary(self.__task_id) + if last_summary is None: + # Task has never run, so running it will modify system + # state + changed = 'never run' + else: + # If any of the inputs have changed since the last run of + # this task, the result may change, so re-run the task. + # Also, it's possible something else changed an output + # file, in which case we also want to re-run the task, so + # check the outputs, too. + changed = self.__summary_changed(last_summary) + + if changed: + self.__debug('unstable (changed: {})', changed) + return False + else: + self.__debug('stable') + return True + + def __summary_changed(self, summary): + """Test if any inputs changed from summary. + + Returns a string describing the changed input, or None. + """ + for dep in summary['deps']: + fn, args, val = dep + method = getattr(self, '_input_' + fn, None) + if method is None: + return 'unknown dependency method {}'.format(fn) + if method == self._input_unstable or method(*args) != val: + return '{}{}'.format(fn, tuple(args)) + return None + + def _input(self, name, *args): + """Register an input for this run. + + This calls self._input_(*args) to get the value of this + input. This function should run quickly and return some + projection of system state that affects the result of this + computation. + + Both args and the return value must be JSON serializable. + """ + method = getattr(self, '_input_' + name) + val = method(*args) + if [name, args, val] not in self.__deps: + self.__deps.append([name, args, val]) + return val + + def run(self): + # Before we run the task, pre-hash any files that were output + # files in the last run. These may be input by this run and + # then clobbered, at which point it will be too late to get an + # input hash. Ideally we would only hash files that were + # *both* input and output files, but latex doesn't tell us + # about input files that didn't exist, so if we start from a + # clean slate, we often require an extra run because we don't + # know a file is input/output until after the second run. 
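+        # (The .aux file is the canonical case: latex both reads and
+        # rewrites it on every pass, so its pre-run hash must be taken
+        # before latex clobbers it.)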
+ last_summary = self.__db.get_summary(self.__task_id) + if last_summary is not None: + for io_filename in last_summary['output_files']: + self.__debug('pre-hashing {}', io_filename) + hash_cache.get(io_filename) + + # Run the task + self.__debug('running') + self.__deps = [] + result = self._execute() + + # Clear cached output file hashes + for filename in result.output_filenames: + hash_cache.invalidate(filename) + + # If the output files change, then the computation needs to be + # re-run, so record them as inputs + for filename in result.output_filenames: + self._input('file', filename) + + # Update task summary in database + self.__db.set_summary(self.__task_id, + self.__make_summary(self.__deps, result)) + del self.__deps + + # Add output files to be cleaned + for f in result.output_filenames: + self.__db.add_clean(f) + + try: + self.__db.commit() + except OSError as e: + raise TaskError('error committing control database {}: {}'.format( + getattr(e, 'filename', ''), e)) from e + + def __make_summary(self, deps, run_result): + """Construct a new task summary.""" + return { + 'deps': deps, + 'output_files': {f: hash_cache.get(f) + for f in run_result.output_filenames}, + 'extra': run_result.extra, + } + + def _execute(self): + """Abstract: Execute this task. + + Subclasses should implement this method to execute this task. + This method must return a RunResult giving the inputs that + were used by the task and the outputs it produced. + """ + raise NotImplementedError('Task._execute is abstract') + + def _get_result_extra(self): + """Return the 'extra' result from the previous run, or None.""" + summary = self.__db.get_summary(self.__task_id) + if summary is None: + return None + return summary['extra'] + + def report(self): + """Report the task's results to stdout and return exit status. + + This may be called when the task has never executed. + Subclasses should override this. The default implementation + reports nothing and returns 0. + """ + return 0 + + # Standard input functions + + def _input_env(self, var): + return os.environ.get(var) + + def _input_file(self, path): + return hash_cache.get(path) + + def _input_unstable(self): + """Mark this run as unstable, regardless of other inputs.""" + return None + + def _input_unknown_input(self): + """An unknown input that may change after latexrun exits. + + This conservatively marks some unknown input that definitely + won't change while latexrun is running, but may change before + the user next runs latexrun. This allows the task to + stabilize during this invocation, but will cause the task to + re-run on the next invocation. + """ + return start_time + +class RunResult(collections.namedtuple( + 'RunResult', 'output_filenames extra')): + """The result of a single task execution. + + This captures all files written by the task, and task-specific + results that need to be persisted between runs (for example, to + enable reporting of a task's results). + """ + pass + +################################################################## +# LaTeX task +# + +def normalize_input_path(path): + # Resolve the directory of the input path, but leave the file + # component alone because it affects TeX's behavior. 
head, tail = os.path.split(path)
+    npath = os.path.join(os.path.realpath(head), tail)
+    return os.path.relpath(npath)
+
+class LaTeX(Task):
+    def __init__(self, db, tex_filename, cmd, cmd_args, obj_dir, nowarns):
+        super().__init__(db, 'latex::' + normalize_input_path(tex_filename))
+        self.__tex_filename = tex_filename
+        self.__cmd = cmd
+        self.__cmd_args = cmd_args
+        self.__obj_dir = obj_dir
+        self.__nowarns = nowarns
+
+        self.__pass = 0
+
+    def _input_args(self):
+        # If filename starts with a character the tex command-line
+        # treats specially, then tweak it so it doesn't.
+        filename = self.__tex_filename
+        if filename.startswith(('-', '&', '\\')):
+            filename = './' + filename
+        # XXX Put these at the beginning in case the provided
+        # arguments are malformed.  Might want to do a best-effort
+        # check for incompatible user-provided arguments (note:
+        # arguments can be given with one or two dashes and those with
+        # values can use an equals or a space).
+        return [self.__cmd] + self.__cmd_args + \
+            ['-interaction', 'nonstopmode', '-recorder',
+             '-output-directory', self.__obj_dir, filename]
+
+    def _execute(self):
+        # Run latex
+        self.__pass += 1
+        args = self._input('args')
+        debug('running {}', args)
+        try:
+            os.makedirs(self.__obj_dir, exist_ok=True)
+        except OSError as e:
+            raise TaskError('failed to create {}: {}'.format(self.__obj_dir, e)) \
+                from e
+        try:
+            verbose_cmd(args)
+            p = subprocess.Popen(args,
+                                 stdin=subprocess.DEVNULL,
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.STDOUT)
+            stdout, has_errors, missing_includes = self.__feed_terminal(p.stdout)
+            status = p.wait()
+        except OSError as e:
+            raise TaskError('failed to execute latex task: ' + str(e)) from e
+
+        # Register environment variable inputs
+        for env_var in ['TEXMFOUTPUT', 'TEXINPUTS', 'TEXFORMATS', 'TEXPOOL',
+                        'TFMFONTS', 'PATH']:
+            self._input('env', env_var)
+
+        jobname, outname = self.__parse_jobname(stdout)
+        inputs, outputs = self.__parse_recorder(jobname)
+
+        # LaTeX overwrites its own inputs.  Mark its output files as
+        # clobbered before we hash its input files.
+        for path in outputs:
+            # In some abort cases (e.g., >=100 errors), LaTeX claims
+            # output files that don't actually exist.
+            if os.path.exists(path):
+                hash_cache.clobber(path)
+        # Depend on input files.  Task.run pre-hashed outputs from the
+        # previous run, so if this isn't the first run and as long as
+        # the set of outputs didn't change, we'll be able to get the
+        # input hashes, even if they were clobbered.
+        for path in inputs:
+            self._input('file', path)
+
+        if missing_includes:
+            # Missing \includes are tricky.  Ideally we'd depend on
+            # the absence of some file, but in fact we'd have to
+            # depend on the failure of a whole kpathsea lookup.
+            # Rather than try to be clever, just mark this as an
+            # unknown input so we'll run at least once on the next
+            # invocation.
+            self._input('unknown_input')
+
+        if not self.__create_outdirs(stdout) and has_errors:
+            # LaTeX reported unrecoverable errors (other than output
+            # directory errors, which we just fixed).  We could
+            # continue to stabilize the document, which may change
+            # some of the other problems reported (but not the
+            # unrecoverable errors), or we can just abort now and get
+            # back to the user quickly with the major errors.  We opt
+            # for the latter.
+            global terminate_task_loop
+            terminate_task_loop = True
+            # This error could depend on something we failed to track.
+ # It would be really confusing if we continued to report + # the error after the user fixed it, so be conservative + # and force a re-run next time. + self._input('unknown_input') + + return RunResult(outputs, + {'jobname': jobname, 'outname': outname, + 'status': status}) + + def __feed_terminal(self, stdout): + prefix = 'latex' + if self.__pass > 1: + prefix += ' ({})'.format(self.__pass) + with Progress(prefix) as progress: + buf = [] + filt = LaTeXFilter() + while True: + # Use os.read to read only what's available on the pipe, + # without waiting to fill a buffer + data = os.read(stdout.fileno(), 4096) + if not data: + break + # See "A note about encoding" above + data = data.decode('ascii', errors='surrogateescape') + buf.append(data) + filt.feed(data) + file_stack = filt.get_file_stack() + if file_stack: + tos = file_stack[-1] + if tos.startswith('./'): + tos = tos[2:] + progress.update('>' * len(file_stack) + ' ' + tos) + else: + progress.update('') + + # Were there unrecoverable errors? + has_errors = any(msg.typ == 'error' for msg in filt.get_messages()) + + return ''.join(buf), has_errors, filt.has_missing_includes() + + def __parse_jobname(self, stdout): + """Extract the job name and output name from latex's output. + + We get these from latex because they depend on complicated + file name parsing rules, are affected by arguments like + -output-directory, and may be just "texput" if things fail + really early. The output name may be None if there were no + pages of output. + """ + jobname = outname = None + for m in re.finditer(r'^Transcript written on "?(.*)\.log"?\.$', stdout, + re.MULTILINE | re.DOTALL): + jobname = m.group(1).replace('\n', '') + if jobname is None: + print(stdout, file=sys.stderr) + raise TaskError('failed to extract job name from latex log') + for m in re.finditer(r'^Output written on "?(.*\.[^ ."]+)"? \([0-9]+ page', + stdout, re.MULTILINE | re.DOTALL): + outname = m.group(1).replace('\n', '') + if outname is None and not \ + re.search(r'^No pages of output\.$|^! Emergency stop\.$' + r'|^! ==> Fatal error occurred, no output PDF file produced!$', + stdout, re.MULTILINE): + print(stdout, file=sys.stderr) + raise TaskError('failed to extract output name from latex log') + + # LuaTeX (0.76.0) doesn't include the output directory in the + # logged transcript or output file name. + if os.path.basename(jobname) == jobname and \ + os.path.exists(os.path.join(self.__obj_dir, jobname + '.log')): + jobname = os.path.join(self.__obj_dir, jobname) + if outname is not None: + outname = os.path.join(self.__obj_dir, outname) + + return jobname, outname + + def __parse_recorder(self, jobname): + """Parse file recorder output.""" + # XXX If latex fails because a file isn't found, that doesn't + # go into the .fls file, but creating that file will affect + # the computation, so it should be included as an input. + # Though it's generally true that files can be added earlier + # in search paths and will affect the output without us knowing. + # + # XXX This is a serious problem for bibtex, since the first + # run won't depend on the .bbl file! But maybe the .aux file + # will always cause a re-run, at which point the .bbl will + # exist? 
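+        # The .fls recorder output is one record per line, e.g.:
+        #   PWD /home/user/thesis
+        #   INPUT ./thesis.tex
+        #   OUTPUT build/thesis.pdf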
+ filename = jobname + '.fls' + try: + recorder = open(filename) + except OSError as e: + raise TaskError('failed to open file recorder output: ' + str(e)) \ + from e + pwd, inputs, outputs = '', set(), set() + for linenum, line in enumerate(recorder): + parts = line.rstrip('\n').split(' ', 1) + if parts[0] == 'PWD': + pwd = parts[1] + elif parts[0] in ('INPUT', 'OUTPUT'): + if parts[1].startswith('/'): + path = parts[1] + else: + # Try to make "nice" paths, especially for clean + path = os.path.relpath(os.path.join(pwd, parts[1])) + if parts[0] == 'INPUT': + inputs.add(path) + else: + outputs.add(path) + else: + raise TaskError('syntax error on line {} of {}' + .format(linenum, filename)) + # Ironically, latex omits the .fls file itself + outputs.add(filename) + return inputs, outputs + + def __create_outdirs(self, stdout): + # In some cases, such as \include'ing a file from a + # subdirectory, TeX will attempt to create files in + # subdirectories of the output directory that don't exist. + # Detect this, create the output directory, and re-run. + m = re.search('^! I can\'t write on file `(.*)\'\\.$', stdout, re.M) + if m and m.group(1).find('/') > 0 and '../' not in m.group(1): + debug('considering creating output sub-directory for {}'. + format(m.group(1))) + subdir = os.path.dirname(m.group(1)) + newdir = os.path.join(self.__obj_dir, subdir) + if os.path.isdir(subdir) and not os.path.isdir(newdir): + debug('creating output subdirectory {}'.format(newdir)) + try: + mkdir_p(newdir) + except OSError as e: + raise TaskError('failed to create output subdirectory: ' + + str(e)) from e + self._input('unstable') + return True + + def report(self): + extra = self._get_result_extra() + if extra is None: + return 0 + + # Parse the log + logfile = open(extra['jobname'] + '.log', 'rt', errors='surrogateescape') + for msg in self.__clean_messages( + LaTeXFilter(self.__nowarns).feed( + logfile.read(), True).get_messages()): + msg.emit() + + # Return LaTeX's exit status + return extra['status'] + + def __clean_messages(self, msgs): + """Make some standard log messages more user-friendly.""" + have_undefined_reference = False + for msg in msgs: + if msg.msg == '==> Fatal error occurred, no output PDF file produced!': + msg = msg._replace(typ='info', + msg='Fatal error (no output file produced)') + if msg.msg.startswith('[LaTeX] '): + # Strip unnecessary package name + msg = msg._replace(msg=msg.msg.split(' ', 1)[1]) + if re.match(r'Reference .* undefined', msg.msg): + have_undefined_reference = True + if have_undefined_reference and \ + re.match(r'There were undefined references', msg.msg): + # LaTeX prints this at the end so the user knows it's + # worthwhile looking back at the log. Since latexrun + # makes the earlier messages obvious, this is + # redundant. 
continue
+            yield msg
+
+    def get_tex_filename(self):
+        return self.__tex_filename
+
+    def get_jobname(self):
+        extra = self._get_result_extra()
+        if extra is None:
+            return None
+        return extra['jobname']
+
+    def get_outname(self):
+        extra = self._get_result_extra()
+        if extra is None:
+            return None
+        return extra['outname']
+
+    def get_status(self):
+        extra = self._get_result_extra()
+        if extra is None:
+            return None
+        return extra['status']
+
+class MakeGlossaries(Task):
+    def __init__(self, db, latex_task, cmd, cmd_args, nowarns, obj_dir):
+        super().__init__(db, 'make_glossaries::' + normalize_input_path(
+            latex_task.get_tex_filename()))
+        self.__latex_task = latex_task
+        self.__cmd = cmd
+        self.__cmd_args = cmd_args
+        self.__obj_dir = obj_dir
+
+    def stable(self):
+        # Mirror BibTeX.stable: until latex has run and produced a
+        # .glo file, makeglossaries has nothing to do and cannot
+        # affect system state, so the task is trivially stable.
+        jobname = self.__latex_task.get_jobname()
+        if jobname is None:
+            return True
+        if not os.path.exists(jobname + '.glo'):
+            return True
+        return super().stable()
+
+    def _input_cwd(self):
+        return os.path.dirname(self.__latex_task.get_jobname())
+
+    def _input_args(self):
+        aux_name = os.path.basename(self.__latex_task.get_jobname())
+        return [self.__cmd] + self.__cmd_args + [aux_name]
+
+    def __feed_terminal(self, stdout):
+        with Progress('makeglossaries') as progress:
+            buf, linebuf = [], ''
+            while True:
+                data = os.read(stdout.fileno(), 4096)
+                if not data:
+                    break
+                # See "A note about encoding" above
+                data = data.decode('ascii', errors='surrogateescape')
+                buf.append(data)
+                linebuf += data
+                while '\n' in linebuf:
+                    line, _, linebuf = linebuf.partition('\n')
+                    if line.startswith('Database file'):
+                        progress.update(line.split(': ', 1)[1])
+            return ''.join(buf)
+
+    def __parse_inputs(self, log, cwd, env):
+        # makeglossaries does not report its inputs in a parseable
+        # form, so just echo its output when debugging.
+        for line in log.splitlines():
+            debug('makeglossaries: {}', line)
+
+    def _execute(self):
+        self.status = ''
+
+        env = os.environ.copy()
+        args, cwd = self._input('args'), self._input('cwd')
+        debug('running {} in {}', args, cwd)
+        try:
+            verbose_cmd(args, cwd, env)
+            p = subprocess.Popen(args, cwd=cwd, env=env,
+                                 stdin=subprocess.DEVNULL,
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.STDOUT)
+            stdout = self.__feed_terminal(p.stdout)
+            status = p.wait()
+        except OSError as e:
+            raise TaskError('failed to execute makeglossaries task: ' + str(e)) from e
+
+        self.__parse_inputs(stdout, cwd, env)
+        outbase = self.__latex_task.get_jobname()
+        # The glossaries package writes the .glo entries and the .ist
+        # style file during the latex run; makeglossaries reads them
+        # and writes the sorted .gls plus a .glg log.
+        inputs = [outbase + '.glo', outbase + '.ist']
+        for path in inputs:
+            self._input('file', path)
+        outputs = [outbase + '.gls', outbase + '.glg']
+        return RunResult(outputs, {'outbase': outbase, 'status': status,
+                                   'inputs': inputs})
+
+
+class LaTeXCommit(Task):
+    def __init__(self, db, latex_task, output_path):
+        super().__init__(db, 'latex_commit::' +
+                         normalize_input_path(latex_task.get_tex_filename()))
+        self.__latex_task = latex_task
+        self.__output_path = output_path
+        self.status = 'There were errors'
+
+    def _input_latex(self):
+        return self.__latex_task.get_status(), self.__latex_task.get_outname()
+
+    def _execute(self):
+        self.status = 'There were errors'
+
+        # If latex succeeded with output, atomically commit the output
+        status, outname = self._input('latex')
+        if status != 0 or outname is None:
+            debug('not committing (status {}, outname {})', status, outname)
+            if outname is None:
+                self.status = 'No pages of output'
+            return RunResult([], None)
+
+        commit = self.__output_path or os.path.basename(outname)
+        if os.path.abspath(commit) == os.path.abspath(outname):
+            debug('skipping commit (outname is commit name)')
+            self.status = None
+            return RunResult([], None)
+
+        try:
+            if os.path.exists(commit) and filecmp.cmp(outname, commit):
+                debug('skipping commit ({} and {} are identical)',
+                      outname, commit)
+                # To avoid confusion, touch the output file
+                open(outname, 'r+b').close()
+            else:
+                debug('committing {} to {}', outname, commit)
+                # (copy then rename(2): the rename is atomic, so no
+                # reader ever sees a partially written output file)
+
shutil.copy(outname, outname + '~') + os.rename(outname + '~', commit) + except OSError as e: + raise TaskError('error committing latex output: {}'.format(e)) from e + self._input('file', outname) + self.status = None + return RunResult([commit], None) + +class LaTeXFilter: + TRACE = False # Set to enable detailed parse tracing + + def __init__(self, nowarns=[]): + self.__data = '' + self.__restart_pos = 0 + self.__restart_file_stack = [] + self.__restart_messages_len = 0 + self.__messages = [] + self.__first_file = None + self.__fatal_error = False + self.__missing_includes = False + self.__pageno = 1 + self.__restart_pageno = 1 + + self.__suppress = {cls: 0 for cls in nowarns} + + def feed(self, data, eof=False): + """Feed LaTeX log data to the parser. + + The log data can be from LaTeX's standard output, or from the + log file. If there will be no more data, set eof to True. + """ + + self.__data += data + self.__data_complete = eof + + # Reset to last known-good restart point + self.__pos = self.__restart_pos + self.__file_stack = self.__restart_file_stack.copy() + self.__messages = self.__messages[:self.__restart_messages_len] + self.__lstart = self.__lend = -1 + self.__pageno = self.__restart_pageno + + # Parse forward + while self.__pos < len(self.__data): + self.__noise() + + # Handle suppressed warnings + if eof: + msgs = ['%d %s warning%s' % (count, cls, "s" if count > 1 else "") + for cls, count in self.__suppress.items() if count] + if msgs: + self.__message('info', None, + '%s not shown (use -Wall to show them)' % + ', '.join(msgs), filename=self.__first_file) + + if eof and len(self.__file_stack) and not self.__fatal_error: + # Fatal errors generally cause TeX to "succumb" without + # closing the file stack, so don't complain in that case. + self.__message('warning', None, + "unbalanced `(' in log; file names may be wrong") + return self + + def get_messages(self): + """Return a list of warning and error Messages.""" + return self.__messages + + def get_file_stack(self): + """Return the file stack for the data that has been parsed. + + This results a list from outermost file to innermost file. + The list may be empty. + """ + + return self.__file_stack + + def has_missing_includes(self): + """Return True if the log reported missing \\include files.""" + return self.__missing_includes + + def __save_restart_point(self): + """Save the current state as a known-good restart point. + + On the next call to feed, the parser will reset to this point. 
+ """ + self.__restart_pos = self.__pos + self.__restart_file_stack = self.__file_stack.copy() + self.__restart_messages_len = len(self.__messages) + self.__restart_pageno = self.__pageno + + def __message(self, typ, lineno, msg, cls=None, filename=None): + if cls is not None and cls in self.__suppress: + self.__suppress[cls] += 1 + return + filename = filename or (self.__file_stack[-1] if self.__file_stack + else self.__first_file) + self.__messages.append(Message(typ, filename, lineno, msg)) + + def __ensure_line(self): + """Update lstart and lend.""" + if self.__lstart <= self.__pos < self.__lend: + return + self.__lstart = self.__data.rfind('\n', 0, self.__pos) + 1 + self.__lend = self.__data.find('\n', self.__pos) + 1 + if self.__lend == 0: + self.__lend = len(self.__data) + + @property + def __col(self): + """The 0-based column number of __pos.""" + self.__ensure_line() + return self.__pos - self.__lstart + + @property + def __avail(self): + return self.__pos < len(self.__data) + + def __lookingat(self, needle): + return self.__data.startswith(needle, self.__pos) + + def __lookingatre(self, regexp, flags=0): + return re.compile(regexp, flags=flags).match(self.__data, self.__pos) + + def __skip_line(self): + self.__ensure_line() + self.__pos = self.__lend + + def __consume_line(self, unwrap=False): + self.__ensure_line() + data = self.__data[self.__pos:self.__lend] + self.__pos = self.__lend + if unwrap: + # TeX helpfully wraps all terminal output at 79 columns + # (max_print_line). If requested, unwrap it. There's + # simply no way to do this perfectly, since there could be + # a line that happens to be 79 columns. + # + # We check for >=80 because a bug in LuaTeX causes it to + # wrap at 80 columns instead of 79 (LuaTeX #900). + while self.__lend - self.__lstart >= 80: + if self.TRACE: print('<{}> wrapping'.format(self.__pos)) + self.__ensure_line() + data = data[:-1] + self.__data[self.__pos:self.__lend] + self.__pos = self.__lend + return data + + # Parser productions + + def __noise(self): + # Most of TeX's output is line noise that combines error + # messages, warnings, file names, user errors and warnings, + # and echos of token lists and other input. This attempts to + # tease these apart, paying particular attention to all of the + # places where TeX echos input so that parens in the input do + # not confuse the file name scanner. There are three + # functions in TeX that echo input: show_token_list (used by + # runaway and show_context, which is used by print_err), + # short_display (used by overfull/etc h/vbox), and show_print + # (used in issue_message and the same places as + # show_token_list). + lookingat, lookingatre = self.__lookingat, self.__lookingatre + if self.__col == 0: + # The following messages are always preceded by a newline + if lookingat('! '): + return self.__errmessage() + if lookingat('!pdfTeX error: '): + return self.__pdftex_fail() + if lookingat('Runaway '): + return self.__runaway() + if lookingatre(r'(Overfull|Underfull|Loose|Tight) \\[hv]box \('): + return self.__bad_box() + if lookingatre('(Package |Class |LaTeX |pdfTeX )?(\w+ )?warning: ', re.I): + return self.__generic_warning() + if lookingatre('No file .*\\.tex\\.$', re.M): + # This happens with \includes of missing files. For + # whatever reason, LaTeX doesn't consider this even + # worth a warning, but I do! 
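+                # (The log shows only a bare line such as
+                #   No file chapters/intro.tex.
+                # with no "Warning" marker around it.)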
+ self.__message('warning', None, + self.__simplify_message( + self.__consume_line(unwrap=True).strip())) + self.__missing_includes = True + return + # Other things that are common and irrelevant + if lookingatre(r'(Package|Class|LaTeX) (\w+ )?info: ', re.I): + return self.__generic_info() + if lookingatre(r'(Document Class|File|Package): '): + # Output from "\ProvidesX" + return self.__consume_line(unwrap=True) + if lookingatre(r'\\\w+=\\[a-z]+\d+\n'): + # Output from "\new{count,dimen,skip,...}" + return self.__consume_line(unwrap=True) + + # print(self.__data[self.__lstart:self.__lend].rstrip()) + # self.__pos = self.__lend + # return + + # Now that we've substantially reduced the spew and hopefully + # eliminated all input echoing, we're left with the file name + # stack, page outs, and random other messages from both TeX + # and various packages. We'll assume at this point that all + # parentheses belong to the file name stack or, if they're in + # random other messages, they're at least balanced and nothing + # interesting happens between them. For page outs, ship_out + # prints a space if not at the beginning of a line, then a + # "[", then the page number being shipped out (this is + # usually, but not always, followed by "]"). + m = re.compile(r'[(){}\n]|(?<=[\n ])\[\d+', re.M).\ + search(self.__data, self.__pos) + if m is None: + self.__pos = len(self.__data) + return + self.__pos = m.start() + 1 + ch = self.__data[m.start()] + if ch == '\n': + # Save this as a known-good restart point for incremental + # parsing, since we definitely didn't match any of the + # known message types above. + self.__save_restart_point() + elif ch == '[': + # This is printed at the end of a page, so we're beginning + # page n+1. + self.__pageno = int(self.__lookingatre(r'\d+').group(0)) + 1 + elif ((self.__data.startswith('`', m.start() - 1) or + self.__data.startswith('`\\', m.start() - 2)) and + self.__data.startswith('\'', m.start() + 1)): + # (, ), {, and } sometimes appear in TeX's error + # descriptions, but they're always in `'s (and sometimes + # backslashed) + return + elif ch == '(': + # XXX Check that the stack doesn't drop to empty and then re-grow + first = self.__first_file is None and self.__col == 1 + filename = self.__filename() + self.__file_stack.append(filename) + if first: + self.__first_file = filename + if self.TRACE: + print('<{}>{}enter {}'.format( + m.start(), ' '*len(self.__file_stack), filename)) + elif ch == ')': + if len(self.__file_stack): + if self.TRACE: + print('<{}>{}exit {}'.format( + m.start(), ' '*len(self.__file_stack), + self.__file_stack[-1])) + self.__file_stack.pop() + else: + self.__message('warning', None, + "extra `)' in log; file names may be wrong ") + elif ch == '{': + # TeX uses this for various things we want to ignore, like + # file names and print_mark. 
Consume up to the '}'
+            epos = self.__data.find('}', self.__pos)
+            if epos != -1:
+                self.__pos = epos + 1
+            else:
+                self.__message('warning', None,
+                               "unbalanced `{' in log; file names may be wrong")
+        elif ch == '}':
+            self.__message('warning', None,
+                           "extra `}' in log; file names may be wrong")
+
+    def __filename(self):
+        initcol = self.__col
+        first = True
+        name = ''
+        # File names may wrap, but if they do, TeX will always print a
+        # newline before the open paren
+        while first or (initcol == 1 and self.__lookingat('\n')
+                        and self.__col >= 79):
+            if not first:
+                self.__pos += 1
+            m = self.__lookingatre(r'[^(){} \n]*')
+            name += m.group()
+            self.__pos = m.end()
+            first = False
+        return name
+
+    def __simplify_message(self, msg):
+        msg = re.sub(r'^(?:Package |Class |LaTeX |pdfTeX )?([^ ]+) (?:Error|Warning): ',
+                     r'[\1] ', msg, flags=re.I)
+        msg = re.sub(r'\.$', '', msg)
+        msg = re.sub(r'has occurred (while \\output is active)', r'\1', msg)
+        return msg
+
+    def __errmessage(self):
+        # Procedure print_err (including \errmessage, itself used by
+        # LaTeX's \GenericError and all of its callers), as well as
+        # fatal_error.  Prints "\n! " followed by error text
+        # ("Emergency stop" in the case of fatal_error).  print_err is
+        # always followed by a call to error, which prints a period,
+        # and a newline...
+        msg = self.__consume_line(unwrap=True)[1:].strip()
+        is_fatal_error = (msg == 'Emergency stop.')
+        msg = self.__simplify_message(msg)
+        # ... and then calls show_context, which prints the input
+        # stack as pairs of lines giving the context.  These context
+        # lines are truncated so they never wrap.  Each pair of lines
+        # will start with either "<...> " if the context is a token
+        # list, "<*> " for terminal input (or command line),
+        # "<read ...>" for stream reads, something like "\macroname
+        # #1->" for macros (though everything after \macroname is
+        # subject to being elided as "..."), or "l.[0-9]+ " if it's a
+        # file.  This is followed by the errant input with a line
+        # break where the error occurred.
+        lineno = None
+        found_context = False
+        stack = []
+        while self.__avail:
+            m1 = self.__lookingatre(r'<([a-z ]+|\*|read [^ >]*)> |\\.*(->|\.\.\.)')
+            m2 = self.__lookingatre(r'l\.[0-9]+ ')
+            if m1:
+                found_context = True
+                pre = self.__consume_line().rstrip('\n')
+                stack.append(pre)
+            elif m2:
+                found_context = True
+                pre = self.__consume_line().rstrip('\n')
+                info, rest = pre.split(' ', 1)
+                lineno = int(info[2:])
+                stack.append(rest)
+            elif found_context:
+                # Done with context
+                break
+            if found_context:
+                # Consume the second context line
+                post = self.__consume_line().rstrip('\n')
+                # Clean up goofy trailing ^^M TeX sometimes includes
+                post = re.sub(r'\^\^M$', '', post)
+                if post[:len(pre)].isspace() and not post.isspace():
+                    stack.append(len(stack[-1]))
+                    stack[-2] += post[len(pre):]
+            else:
+                # If we haven't found the context, skip the line.
+                self.__skip_line()
+        stack_msg = ''
+        for i, trace in enumerate(stack):
+            stack_msg += ('\n ' + (' ' * trace) + '^'
+                          if isinstance(trace, int) else
+                          '\n at ' + trace.rstrip() if i == 0 else
+                          '\n from ' + trace.rstrip())
+
+        if is_fatal_error:
+            # fatal_error always prints one additional line of message
+            info = self.__consume_line().strip()
+            if info.startswith('*** '):
+                info = info[4:]
+            msg += ': ' + info.lstrip('(').rstrip(')')
+
+        self.__message('error', lineno, msg + stack_msg)
+        self.__fatal_error = True
+
+    def __pdftex_fail(self):
+        # Procedure pdftex_fail.  Prints "\n!pdfTeX error: ", the
+        # message, and a newline.
Unlike print_err, there's never + # context. + msg = self.__consume_line(unwrap=True)[1:].strip() + msg = self.__simplify_message(msg) + self.__message('error', None, msg) + + def __runaway(self): + # Procedure runaway. Prints "\nRunaway ...\n" possibly + # followed by token list (user text). Always followed by a + # call to print_err, so skip lines until we see the print_err. + self.__skip_line() # Skip "Runaway ...\n" + if not self.__lookingat('! ') and self.__avail: + # Skip token list, which is limited to one line + self.__skip_line() + + def __bad_box(self): + # Function hpack and vpack. hpack prints a warning, a + # newline, then a short_display of the offending text. + # Unfortunately, there's nothing indicating the end of the + # offending text, but it should be on one (possible wrapped) + # line. vpack prints a warning and then, *unless output is + # active*, a newline. The missing newline is probably a bug, + # but it sure makes our lives harder. + origpos = self.__pos + msg = self.__consume_line() + m = re.search(r' in (?:paragraph|alignment) at lines ([0-9]+)--([0-9]+)', msg) or \ + re.search(r' detected at line ([0-9]+)', msg) + if m: + # Sometimes TeX prints crazy line ranges like "at lines + # 8500--250". The lower number seems roughly sane, so use + # that. I'm not sure what causes this, but it may be + # related to shipout routines messing up line registers. + lineno = min(int(m.group(1)), int(m.groups()[-1])) + msg = msg[:m.start()] + else: + m = re.search(r' while \\output is active', msg) + if m: + lineno = None + msg = msg[:m.end()] + else: + self.__message('warning', None, + 'malformed bad box message in log') + return + # Back up to the end of the known message text + self.__pos = origpos + m.end() + if self.__lookingat('\n'): + # We have a newline, so consume it and look for the + # offending text. + self.__pos += 1 + # If there is offending text, it will start with a font + # name, which will start with a \. + if 'hbox' in msg and self.__lookingat('\\'): + self.__consume_line(unwrap=True) + msg = self.__simplify_message(msg) + ' (page {})'.format(self.__pageno) + cls = msg.split(None, 1)[0].lower() + self.__message('warning', lineno, msg, cls=cls) + + def __generic_warning(self): + # Warnings produced by LaTeX's \GenericWarning (which is + # called by \{Package,Class}Warning and \@latex@warning), + # warnings produced by pdftex_warn, and other random warnings. + msg, cls = self.__generic_info() + # Most warnings include an input line emitted by \on@line + m = re.search(' on input line ([0-9]+)', msg) + if m: + lineno = int(m.group(1)) + msg = msg[:m.start()] + else: + lineno = None + msg = self.__simplify_message(msg) + self.__message('warning', lineno, msg, cls=cls) + + def __generic_info(self): + # Messages produced by LaTeX's \Generic{Error,Warning,Info} + # and things that look like them + msg = self.__consume_line(unwrap=True).strip() + # Package and class messages are continued with lines + # containing '(package name) ' + pkg_name = msg.split(' ', 2)[1] + prefix = '(' + pkg_name + ') ' + while self.__lookingat(prefix): + # Collect extra lines. It's important that we keep these + # because they may contain context information like line + # numbers. 
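+            # e.g.:
+            #   Package hyperref Warning: Token not allowed in a PDF string,
+            #   (hyperref)                removing `math shift' on input line 73.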
+ extra = self.__consume_line(unwrap=True) + msg += ' ' + extra[len(prefix):].strip() + return msg, pkg_name.lower() + +################################################################## +# BibTeX task +# + +class BibTeX(Task): + def __init__(self, db, latex_task, cmd, cmd_args, nowarns, obj_dir): + super().__init__(db, 'bibtex::' + normalize_input_path( + latex_task.get_tex_filename())) + self.__latex_task = latex_task + self.__cmd = cmd + self.__cmd_args = cmd_args + self.__obj_dir = obj_dir + + def stable(self): + # If bibtex doesn't have its inputs, then it's stable because + # it has no effect on system state. + jobname = self.__latex_task.get_jobname() + if jobname is None: + # We don't know where the .aux file is until latex has run + return True + if not os.path.exists(jobname + '.aux'): + # Input isn't ready, so bibtex will simply fail without + # affecting system state. Hence, this task is trivially + # stable. + return True + if not self.__find_bib_cmds(os.path.dirname(jobname), jobname + '.aux'): + # The tex file doesn't refer to any bibliographic data, so + # don't run bibtex. + return True + + return super().stable() + + def __find_bib_cmds(self, basedir, auxname, stack=()): + debug('scanning for bib commands in {}'.format(auxname)) + if auxname in stack: + raise TaskError('.aux file loop') + stack = stack + (auxname,) + + try: + aux_data = open(auxname, errors='surrogateescape').read() + except FileNotFoundError: + # The aux file may not exist if latex aborted + return False + if re.search(r'^\\bibstyle\{', aux_data, flags=re.M) or \ + re.search(r'^\\bibdata\{', aux_data, flags=re.M): + return True + + if re.search(r'^\\abx@aux@cite\{', aux_data, flags=re.M): + # biber citation + return True + + # Recurse into included aux files (see aux_input_command), in + # case \bibliography appears in an \included file. + for m in re.finditer(r'^\\@input\{([^}]*)\}', aux_data, flags=re.M): + if self.__find_bib_cmds(basedir, os.path.join(basedir, m.group(1)), + stack): + return True + + return False + + def _input_args(self): + if self.__is_biber(): + aux_name = os.path.basename(self.__latex_task.get_jobname()) + else: + aux_name = os.path.basename(self.__latex_task.get_jobname()) + '.aux' + return [self.__cmd] + self.__cmd_args + [aux_name] + + def _input_cwd(self): + return os.path.dirname(self.__latex_task.get_jobname()) + + def _input_auxfile(self, auxname): + # We don't consider the .aux files regular inputs. + # Instead, we extract just the bit that BibTeX cares about + # and depend on that. See get_aux_command_and_process in + # bibtex.web. + debug('hashing filtered aux file {}', auxname) + try: + with open(auxname, 'rb') as aux: + h = hashlib.sha256() + for line in aux: + if line.startswith((b'\\citation{', b'\\bibdata{', + b'\\bibstyle{', b'\\@input{', + b'\\abx@aux@cite{')): + h.update(line) + return h.hexdigest() + except FileNotFoundError: + debug('{} does not exist', auxname) + return None + + def __path_join(self, first, rest): + if rest is None: + # Append ':' to keep the default search path + return first + ':' + return first + ':' + rest + + def __is_biber(self): + return "biber" in self.__cmd + + def _execute(self): + # This gets complicated when \include is involved. \include + # switches to a different aux file and records its path in the + # main aux file. 
However, BibTeX does not consider this path + # to be relative to the location of the main aux file, so we + # have to run BibTeX *in the output directory* for it to + # follow these includes (there's no way to tell BibTeX other + # locations to search). Unfortunately, this means BibTeX will + # no longer be able to find local bib or bst files, but so we + # tell it where to look by setting BIBINPUTS and BSTINPUTS + # (luckily we can control this search). We have to pass this + # same environment down to Kpathsea when we resolve the paths + # in BibTeX's log. + args, cwd = self._input('args'), self._input('cwd') + debug('running {} in {}', args, cwd) + + env = os.environ.copy() + env['BIBINPUTS'] = self.__path_join(os.getcwd(), env.get('BIBINPUTS')) + env['BSTINPUTS'] = self.__path_join(os.getcwd(), env.get('BSTINPUTS')) + + try: + verbose_cmd(args, cwd, env) + p = subprocess.Popen(args, cwd=cwd, env=env, + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + stdout = self.__feed_terminal(p.stdout) + status = p.wait() + except OSError as e: + raise TaskError('failed to execute bibtex task: ' + str(e)) from e + + inputs, auxnames, outbase = self.__parse_inputs(stdout, cwd, env) + if not inputs and not auxnames: + # BibTeX failed catastrophically. + print(stdout, file=sys.stderr) + raise TaskError('failed to execute bibtex task') + + # Register environment variable inputs + for env_var in ['TEXMFOUTPUT', 'BSTINPUTS', 'BIBINPUTS', 'PATH']: + self._input('env', env_var) + + # Register file inputs + for path in auxnames: + self._input('auxfile', path) + for path in inputs: + self._input('file', path) + + if self.__is_biber(): + outbase = os.path.join(cwd, outbase) + outputs = [outbase + '.bbl', outbase + '.blg'] + return RunResult(outputs, {'outbase': outbase, 'status': status, + 'inputs': inputs}) + + def __feed_terminal(self, stdout): + with Progress('bibtex') as progress: + buf, linebuf = [], '' + while True: + data = os.read(stdout.fileno(), 4096) + if not data: + break + # See "A note about encoding" above + data = data.decode('ascii', errors='surrogateescape') + buf.append(data) + linebuf += data + while '\n' in linebuf: + line, _, linebuf = linebuf.partition('\n') + if line.startswith('Database file'): + progress.update(line.split(': ', 1)[1]) + return ''.join(buf) + + def __parse_inputs(self, log, cwd, env): + # BibTeX conveniently logs every file that it opens, and its + # log is actually sensible (see calls to a_open_in in + # bibtex.web.) The only trick is that these file names are + # pre-kpathsea lookup and may be relative to the directory we + # ran BibTeX in. + # + # Because BibTeX actually depends on very little in the .aux + # file (and it's likely other things will change in the .aux + # file), we don't count the whole .aux file as an input, but + # instead depend only on the lines that matter to BibTeX. 
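+        # (Concretely, only \citation{...}, \bibdata{...}, \bibstyle{...},
+        # \@input{...}, and biber's \abx@aux@cite{...} lines are hashed;
+        # see _input_auxfile above.)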
+        kpathsea = Kpathsea('bibtex')
+        inputs = []
+        auxnames = []
+        outbase = None
+        for line in log.splitlines():
+            m = re.match('(?:The top-level auxiliary file:'
+                         '|A level-[0-9]+ auxiliary file:) (.*)', line)
+            if m:
+                auxnames.append(os.path.join(cwd, m.group(1)))
+                continue
+            m = re.match('(?:(The style file:)|(Database file #[0-9]+:)) (.*)',
+                         line)
+            if m:
+                filename = m.group(3)
+                if m.group(1):
+                    filename = kpathsea.find_file(filename, 'bst', cwd, env)
+                elif m.group(2):
+                    filename = kpathsea.find_file(filename, 'bib', cwd, env)
+
+                # If this absolute path points inside the source
+                # directory, make it relative for error reporting and
+                # portability of the dependency DB
+                if filename.startswith('/'):
+                    relname = os.path.relpath(filename)
+                    if '../' not in relname:
+                        filename = relname
+
+                inputs.append(filename)
+
+            # biber reports its inputs and log file differently
+            m = re.search("Found BibTeX data source '(.*?)'", line)
+            if m:
+                inputs.append(m.group(1))
+
+            m = re.search("Logfile is '(.*?)'", line)
+            if m:
+                # Strip the '.blg' extension
+                outbase = m.group(1)[:-4]
+
+        if outbase is None:
+            # Derive the output base from the main aux file
+            outbase = auxnames[0][:-4]
+
+        return inputs, auxnames, outbase
+
+    def report(self):
+        extra = self._get_result_extra()
+        if extra is None:
+            return 0
+
+        # Parse and pretty-print the log (surrogateescape, per "A
+        # note about encoding" above, so stray bytes can't crash us)
+        log = open(extra['outbase'] + '.blg', 'rt',
+                   errors='surrogateescape').read()
+        inputs = extra['inputs']
+        for msg in BibTeXFilter(log, inputs).get_messages():
+            msg.emit()
+
+        # BibTeX exits with 1 if there are warnings, 2 if there are
+        # errors, and 3 if there are fatal errors (sysdep.h).
+        # Translate to a normal UNIX exit status.
+        if extra['status'] >= 2:
+            return 1
+        return 0
+
+class BibTeXFilter:
+    def __init__(self, data, inputs):
+        self.__inputs = inputs
+        self.__key_locs = None
+
+        self.__messages = []
+
+        prev_line = ''
+        for line in data.splitlines():
+            msg = self.__process_line(prev_line, line)
+            if msg is not None:
+                self.__messages.append(Message(*msg))
+            prev_line = line
+
+    def get_messages(self):
+        """Return a list of warning and error Messages."""
+        # BibTeX reports most errors in no particular order. Sort by
+        # file and line.
+        return sorted(self.__messages,
+                      key=lambda msg: (msg.filename or '', msg.lineno or 0))
+
+    def __process_line(self, prev_line, line):
+        m = None
+        def match(regexp):
+            nonlocal m
+            m = re.match(regexp, line)
+            return m
+
+        # BibTeX has many error paths, but luckily the set is closed,
+        # so we can find all of them. This first case is the
+        # workhorse format.
+        #
+        # AUX errors: aux_err/aux_err_return/aux_err_print
+        #
+        # BST errors: bst_ln_num_print/bst_err/
+        #     bst_err_print_and_look_for_blank_line_return/
+        #     bst_warn_print/bst_warn/
+        #     skip_token/skip_token_print/
+        #     bst_ext_warn/bst_ext_warn_print/
+        #     bst_ex_warn/bst_ex_warn_print/
+        #     bst_mild_ex_warn/bst_mild_ex_warn_print/
+        #     bst_string_size_exceeded
+        #
+        # BIB errors: bib_ln_num_print/
+        #     bib_err_print/bib_err/
+        #     bib_warn_print/bib_warn/
+        #     bib_one_of_two_expected_err/macro_name_warning/
+        if match('(.*?)---?line ([0-9]+) of file (.*)'):
+            # Sometimes the real error is printed on the previous line
+            if m.group(1) == 'while executing':
+                # bst_ex_warn. The real message is on the previous line
+                text = prev_line
+            else:
+                text = m.group(1) or prev_line
+            typ, msg = self.__canonicalize(text)
+            return (typ, m.group(3), int(m.group(2)), msg)
+
+        # overflow/print_overflow
+        if match('Sorry---you\'ve exceeded BibTeX\'s (.*)'):
+            return ('error', None, None, 'capacity exceeded: ' + m.group(1))
+        # confusion/print_confusion
+        if match('(.*)---this can\'t happen$'):
+            return ('error', None, None, 'internal error: ' + m.group(1))
+        # aux_end_err
+        if match('I found (no .*)---while reading file (.*)'):
+            return ('error', m.group(2), None, m.group(1))
+        # bad_cross_reference_print/
+        # nonexistent_cross_reference_error
+        #
+        # This is split across two lines. Match the second.
+        if match('^refers to entry "'):
+            typ, msg = self.__canonicalize(prev_line + ' ' + line)
+            msg = re.sub('^a (bad cross reference)', '\\1', msg)
+            # Try to give this key a location
+            filename = lineno = None
+            m2 = re.search(r'--entry "([^"]+)"', prev_line)
+            if m2:
+                filename, lineno = self.__find_key(m2.group(1))
+            return (typ, filename, lineno, msg)
+        # print_missing_entry
+        if match('Warning--I didn\'t find a database entry for (".*")'):
+            return ('warning', None, None,
+                    'no database entry for ' + m.group(1))
+        # x_warning
+        if match('Warning--(.*)'):
+            # Most formats give warnings of the form "something in
+            # <key>". Try to match up the key.
+            filename = lineno = None
+            for m2 in reversed(list(re.finditer(r' in ([^, \t\n]+)\b', line))):
+                filename, lineno = self.__find_key(m2.group(1))
+                if filename:
+                    break
+            return ('warning', filename, lineno, m.group(1))
+        if match('Aborted at line ([0-9]+) of file (.*)'):
+            return ('info', m.group(2), int(m.group(1)), 'aborted')
+
+        # biber-style messages
+        if match('^.*> WARN - (.*)$'):
+            m2 = re.match(r"(.*) in file '(.*?)', skipping ...", m.group(1))
+            if m2:
+                return ('warning', m2.group(2), None, m2.group(1))
+            return ('warning', None, None, m.group(1))
+
+        if match('^.*> ERROR - (.*)$'):
+            m2 = re.match(r"BibTeX subsystem: (.*?), line (\d+), (.*)$",
+                          m.group(1))
+            if m2:
+                return ('error', m2.group(1), int(m2.group(2)), m2.group(3))
+            return ('error', None, None, m.group(1))
+
+    def __canonicalize(self, msg):
+        if msg.startswith('Warning'):
+            msg = re.sub('^Warning-*', '', msg)
+            typ = 'warning'
+        else:
+            typ = 'error'
+        msg = re.sub('^I(\'m| was)? ', '', msg)
+        msg = msg[:1].lower() + msg[1:]
+        return typ, msg
+
+    def __find_key(self, key):
+        if self.__key_locs is None:
+            p = BibTeXKeyParser()
+            self.__key_locs = {}
+            for filename in self.__inputs:
+                data = open(filename, 'rt', errors='surrogateescape').read()
+                for pkey, lineno in p.parse(data):
+                    self.__key_locs.setdefault(pkey, (filename, lineno))
+        return self.__key_locs.get(key, (None, None))
+
+class BibTeXKeyParser:
+    """Just enough of a BibTeX parser to find keys."""
+
+    def parse(self, data):
+        IDENT_RE = '(?![0-9])([^\x00-\x20\x80-\xff \t"#%\'(),={}]+)'
+        self.__pos, self.__data = 0, data
+        # Find the next entry
+        while self.__consume('[^@]*@[ \t\n]*'):
+            # What type of entry?
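+            # Sketch of the entry shapes handled below (illustrative,
+            # not exhaustive):
+            #   @article{knuth84, ...}  -> yields key 'knuth84'
+            #   @string{tug = "..."}    -> no key; body is skipped
+            #   @preamble{"..."}        -> no key; body is skipped
+            #   @comment ...            -> skipped; scan resumes at next '@'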
+            if not self.__consume(IDENT_RE + '[ \t\n]*'):
+                continue
+            typ = self.__m.group(1)
+            if typ == 'comment':
+                continue
+            start = self.__pos
+            if not self.__consume('([{(])[ \t\n]*'):
+                continue
+            closing, key_re = {'{': ('}', '([^, \t\n}]*)'),
+                               '(': (')', '([^, \t\n]*)')}[self.__m.group(1)]
+            if typ not in ('preamble', 'string'):
+                # Regular entry; get key
+                if self.__consume(key_re):
+                    yield self.__m.group(1), self.__lineno()
+            # Consume body of entry
+            self.__pos = start
+            self.__balanced(closing)
+
+    def __consume(self, regexp):
+        self.__m = re.compile(regexp).match(self.__data, self.__pos)
+        if self.__m:
+            self.__pos = self.__m.end()
+        return self.__m
+
+    def __lineno(self):
+        return self.__data.count('\n', 0, self.__pos) + 1
+
+    def __balanced(self, closing):
+        # Skip the opening delimiter, then scan for the matching
+        # close, tracking brace nesting along the way.
+        self.__pos += 1
+        level = 0
+        skip = re.compile('[{}' + closing + ']')
+        while True:
+            m = skip.search(self.__data, self.__pos)
+            if not m:
+                break
+            self.__pos = m.end()
+            ch = m.group(0)
+            if level == 0 and ch == closing:
+                break
+            elif ch == '{':
+                level += 1
+            elif ch == '}':
+                level -= 1
+
+class Kpathsea:
+    def __init__(self, program_name):
+        self.__progname = program_name
+
+    def find_file(self, name, format, cwd=None, env=None):
+        """Return the resolved path of 'name' or None."""
+        args = ['kpsewhich', '-progname', self.__progname, '-format', format,
+                name]
+        try:
+            verbose_cmd(args, cwd, env)
+            path = subprocess.check_output(
+                args, cwd=cwd, env=env, universal_newlines=True).strip()
+        except subprocess.CalledProcessError as e:
+            # kpsewhich exits with 1 when it can't resolve the name
+            if e.returncode != 1:
+                raise
+            return None
+        if cwd is None:
+            return path
+        return os.path.join(cwd, path)
+
+if __name__ == "__main__":
+    main()
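+
+# A rough usage sketch of the two helpers above (values are
+# illustrative, not from a real run):
+#
+#   >>> list(BibTeXKeyParser().parse('@article{knuth84,\n  title={X}}'))
+#   [('knuth84', 1)]
+#
+#   >>> Kpathsea('bibtex').find_file('plain.bst', 'bst')
+#   '/usr/share/texmf-dist/bibtex/bst/base/plain.bst'
+#   (the path varies by TeX installation; None if kpsewhich cannot
+#   resolve the name)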