#! /usr/bin/env python
#
# @file   tex-acronyms.py
# @author Aleix Conchillo Flaque <aconchillo@gmail.com>
# @date   Fri Jul 28, 2005 12:05
#
# This script parses the acronyms in the specified LaTeX file and
# tries to find a description (specified as a nomencl definition) in
# the files (*.tex) located in the data directory. The script will
# create two files: one with acronyms and their descriptions and the
# other with conflicts, that is, acronyms with no or duplicate
# description.
#
# The following example is a sample of an acronyms list file. Note
# that the LaTeX document must use the nomencl package, since the
# acronyms are defined using the nomencl syntax.
#
# \nomenclature{ADDR}{Address}
# \nomenclature{ANSI}{American National Standards Institute}
# \nomenclature{API}{Application Programming Interface}
# ...
#
# It is also possible to provide a list of words to be excluded in the
# "exclude_acronyms.txt" file, that must be located in the data
# directory by default (a different file can be specified using
# "-x"). For example:
#
# CHAPTERS
# CLOSED
# DUPLICATE
# FIRST
# FIXED
# BOTH
# BUS
# ...
#
# It is easy to find words to be excluded because they will be treated
# as errors (and written to the errors file) as no definition is
# available for them.
#
# Finally, it is possible to parse the input files recursively by
# passing the "-r" argument. This will also parse all the files
# included with "\input".
#
# A possible program call could be:
#
# tex-acronyms.py -r -d ~/acronyms -i article.tex -o acronyms.tex -e acronyms.errors
#
# where the acronyms found in the article.tex file (and its
# dependencies) will be matched against the acronyms found in
# ~/acronyms/*.tex. The matched acronyms will be written to
# acronyms.tex and the errors in acronyms.errors.
#

import getopt
import glob
import os
import re
import sys

def add_acronyms(acronyms, new_acronyms):
    """Merge single definitions into a map of definition lists.

    `acronyms` maps each acronym to a list of definitions and is updated
    in place. `new_acronyms` maps each acronym to one definition, which
    is appended to the matching list; a new one-element list is started
    for an acronym that has not been seen before.
    """
    for name, definition in new_acronyms.items():
        acronyms.setdefault(name, []).append(definition)

def tex_input_files(filename):
    r"""Return the names of the files a LaTeX file includes with "\input".

    Only lines that begin with "\input{...}" are considered. The text
    between the braces is returned verbatim, in order of appearance, so
    a name may or may not carry the ".tex" extension.

    Raises IOError if `filename` cannot be opened.
    """
    pattern = re.compile(r"^\\input\{(.*)\}")
    filenames = []
    # The context manager releases the file handle on every path,
    # including when reading fails part-way through.
    with open(filename, "r") as tex_file:
        for line in tex_file:
            found = pattern.match(line)
            if found:
                filenames.append(found.group(1))
    return filenames

def read_exclude_acronyms_file(filename):
    """Load the words that must never be reported as acronyms.

    The file holds one word per line. Surrounding whitespace is stripped
    and blank lines are ignored. Returns a dict whose keys are the words
    (every value is None), which keeps membership tests cheap.

    If the file cannot be opened or read, a warning is printed and an
    empty dict is returned, so a missing exclusion list is not fatal.
    """
    try:
        with open(filename, "r") as exclude_file:
            words = [line.strip() for line in exclude_file]
    except IOError:
        # A single parenthesised argument prints the same text under
        # both Python 2 and Python 3.
        print("\nWarning: exclude acronyms file %s not found." % filename)
        words = []
    # An empty string can never be an acronym, so blank lines are dropped.
    return dict.fromkeys(word for word in words if word)

def parse_acronyms_file(filename):
    r"""Read the nomencl definitions found in a LaTeX file.

    Every line that begins with "\nomenclature{ACRONYM}{DEFINITION}"
    contributes one entry. Returns a dict mapping each acronym to its
    definition. When the same acronym is defined more than once in the
    file, the last definition is the one kept.

    Raises IOError if `filename` cannot be opened.
    """
    pattern = re.compile(r"^\\nomenclature\{(.*)\}\{(.*)\}")
    acronyms = {}
    # The context manager closes the file even if reading raises.
    with open(filename, "r") as tex_file:
        for line in tex_file:
            found = pattern.match(line)
            if found:
                acronym, definition = found.groups()
                acronyms[acronym] = definition
    return acronyms

def append_acronyms_file(filename, exclude_acronyms, skip_filename,
                         recursive, acronyms):
    r"""Collect the acronyms used in a LaTeX file.

    An acronym is a run of two or more capital letters (A-Z) that is not
    directly attached to any other word character (letter, digit or
    underscore).

    filename         -- file to scan; ".tex" is appended when the path
                        as given does not exist.
    exclude_acronyms -- container of words that must be ignored.
    skip_filename    -- file that must not be scanned; nothing is done
                        when `filename` (with or without ".tex") equals
                        it.
    recursive        -- when true, the files named by "\input" lines are
                        scanned as well, via tex_input_files().
    acronyms         -- dict updated in place; each acronym found is
                        stored as both key and value.

    Raises IOError if the file cannot be opened.
    """
    if skip_filename in (filename, filename + ".tex"):
        return

    # Word boundaries do not consume the neighbouring characters, so
    # acronyms separated by a single character ("API ANSI", "CPU/GPU")
    # are all found, as is an acronym that ends the file without a
    # trailing newline.
    pattern = re.compile(r"\b([A-Z]{2,})\b")
    if not os.path.exists(filename):
        filename = filename + ".tex"
    print("- Parsing %s" % filename)
    # The context manager closes the file even if reading raises.
    with open(filename, "r") as tex_file:
        for line in tex_file:
            for acronym in pattern.findall(line):
                if acronym in exclude_acronyms or acronym in acronyms:
                    continue
                acronyms[acronym] = acronym
    if recursive:
        for included in tex_input_files(filename):
            append_acronyms_file(included, exclude_acronyms, skip_filename,
                                 recursive, acronyms)

def intersect_acronyms(acronyms_a, acronyms_b):
    """Look up the acronyms of `acronyms_b` among the definitions in
    `acronyms_a`.

    acronyms_a -- dict mapping each acronym to a list of definitions.
    acronyms_b -- dict whose keys are the acronyms of interest (the
                  values are not used).

    Returns a pair of dicts (acronyms_int, acronyms_dup):
    acronyms_int maps every acronym that has exactly one definition to
    that definition, and acronyms_dup maps every acronym that has
    several definitions to the whole list. Acronyms without an entry in
    `acronyms_a` appear in neither.
    """
    acronyms_int = {}
    acronyms_dup = {}
    # sorted() accepts any iterable of keys, so this works with both the
    # key lists of Python 2 and the key views of Python 3.
    for acronym in sorted(acronyms_b):
        if acronym not in acronyms_a:
            continue
        definitions = acronyms_a[acronym]
        if len(definitions) > 1:
            acronyms_dup[acronym] = definitions
        else:
            acronyms_int[acronym] = definitions[0]
    return acronyms_int, acronyms_dup

### Start application

# Command line summary, printed whenever the arguments cannot be used.
usage = "\nUsage: %s [-r] -d acronyms_dir -i main_file " \
        "-o acronyms_file -e conflicts_file [-x exclude_file]\n" \
        % sys.argv[0]

try:
    arguments, arguments_left = getopt.getopt(sys.argv[1:], "rd:i:o:e:x:")
except getopt.GetoptError as error:
    # Unknown option or missing option value: explain the problem
    # instead of ending the script with a traceback.
    print("\nError: %s" % error)
    print(usage)
    sys.exit(1)

# When an option is given more than once the last value wins.
options = dict(arguments)

# Each of these options is needed further down. Checking them by name
# (rather than counting how many options were given) also catches a
# repeated option that hides a missing one.
for required in ("-d", "-i", "-o", "-e"):
    if required not in options:
        print(usage)
        sys.exit(1)

recursive = "-r" in options
exclude = "-x" in options
acronyms_dir = options["-d"]
main_filename = options["-i"]
acronyms_tex_filename = options["-o"]
acronyms_err_filename = options["-e"]

if exclude:
    acronyms_exc_filename = options["-x"]
else:
    # Without "-x" the exclusion list is looked up in the data directory.
    acronyms_exc_filename = os.path.join(acronyms_dir, "exclude_acronyms.txt")
exclude_acronyms = read_exclude_acronyms_file(acronyms_exc_filename)

## Acronyms read from .tex files in directory

# Maps each acronym to the list of all the definitions found for it.
global_acronyms = {}
# The files are visited in sorted order so that the order of duplicate
# definitions does not depend on the directory listing order.
for filename in sorted(glob.glob(os.path.join(acronyms_dir, "*.tex"))):
    add_acronyms(global_acronyms, parse_acronyms_file(filename))

print("")

## Acronyms used in the main file (and, with "-r", in its "\input" files)

# The generated acronyms list is passed as the file to skip, so it is
# never scanned for acronyms itself.
file_acronyms = {}
append_acronyms_file(main_filename, exclude_acronyms,
                     acronyms_tex_filename,
                     recursive, file_acronyms)

print("")

acronyms_int, acronyms_dup = intersect_acronyms(global_acronyms, file_acronyms)

## Acronyms list

with open(acronyms_tex_filename, "w") as output:
    for acronym in sorted(acronyms_int):
        output.write("\\nomenclature{%s}{%s}\n" % (acronym,
                                                   acronyms_int[acronym]))

## Acronyms conflicts

# Create TBD acronyms list: acronyms that are used but have no
# definition at all.
acronyms_nul = [acronym for acronym in sorted(file_acronyms)
                if acronym not in global_acronyms]

# The conflicts file is only written when there is something to report.
if acronyms_nul or acronyms_dup:
    with open(acronyms_err_filename, "w") as output:
        # Duplicate acronyms: the second field is the list of competing
        # definitions, formatted as a Python list.
        for acronym in sorted(acronyms_dup):
            output.write("\\nomenclature{%s}{%s}\n" % (acronym,
                                                       acronyms_dup[acronym]))

        # TBD acronyms
        for acronym in acronyms_nul:
            output.write("\\nomenclature{%s}{}\n" % acronym)
