#!/usr/bin/env python
# escapeTeXt.py
#  Convert text to something LaTeX is less likely to choke on.
# 2006-Aug-24 Jim Hefferon  Written.  Based on stuff from Tristan Miller and
#  Allin Cottrell

# Like txt2latex but beyond being callable from the command line
#  ./escapeTeXt.py < file1.txt > file1.ltx
# it is callable from a program as a function.
#  import escapeTeXt
#  result=escapeTeXt.escapetext(string).output()

# TODO
# * at sign?
# * copyright, trademark?
# * allow conversion of "_Great Gatsby_" to "\textit{Great Gatsby}"
# * allow conversion of "I *really* do" to "I \emph{really} do"  
# * add refinement of recognizing fractions "2/3" ==> "$2/3$"?

import sys, re
from getopt import getopt, GetoptError

# Flag presumably intended to switch on debugging output; nothing shown in
# this file reads it.
DEBUG=False  # currently unused

# StandardError exists only in Python 2; fall back to Exception elsewhere so
# that importing this module does not fail with a NameError.
try:
    _escapetextErrorBase=StandardError
except NameError:
    _escapetextErrorBase=Exception

class escapetextError(_escapetextErrorBase): # currently unused
    """Error type reserved for this module; nothing raises it at present."""
    pass

class escapetext(object):
    """Turn plain text into strings that have a chance of making it through
    LaTeX.

    Text is collected by the constructor and by feed(); output() escapes
    everything collected so far and returns it.  The counts of single and
    double quotes seen so far are kept on the instance, so a quotation that
    is opened in one piece of text can be closed in a later one.
    """
    def __init__(self,txt='',countQuotes=True,refinements=False):
        """Initialize instance.  Build up the text in parts (say, as lines
        from the input source) with this initializing, and using the feed()
        routine.  Then we dump it with output().
          txt=''  Text to start with
          countQuotes=True  Should we try to match open with closed quotes?
          refinements=False  Should we try to get fancy?  (Try to fix
            en dashes, em dashes, the 'p. 2' construct.)
        """
        self.countQuotes=countQuotes
        self.refinements=refinements
        self.singleQuoteCount=0  # number of ' seen since the last reset
        self.doubleQuoteCount=0  # number of " seen since the last reset
        self.strings=[txt]  # array of strings that we have been fed

    def feed(self,txt):
        """Add more text to the pile.
          txt   Text to add
        """
        self.strings.append(txt)

    # Literal replacements, applied in the order listed.  The order matters:
    # the backslash goes first because the later entries insert backslashes,
    # and the fourth entry repairs what the two brace entries do to the
    # braces inserted by the first.  Entries whose replacement contains
    # braces must come after the brace entries.
    _simpleSubstitutions=(
        (u"\\",u"{\\textbackslash}"),  # backslashes
        (u"{",u"\\{"),                 # open curly braces
        (u"}",u"\\}"),                 # close curly braces
        (u"\\{\\textbackslash\\}",u"{\\textbackslash}"),  # fix-up, see above
        (u"$",u"\\$"),                 # dollar signs
        (u"%",u"\\%"),                 # percent signs
        (u"_",u"\\_"),                 # underscores
        (u"&",u"\\&"),                 # ampersands
        (u"#",u"\\#"),                 # sharp signs
        (u"<",u"{\\textless}"),        # less than
        (u">",u"{\\textgreater}"),     # greater than
        (u"~",u"{\\textasciitilde}"),  # tildes (otherwise an unbreakable space)
        (u"^",u"{\\textasciicircum}"), # carets (otherwise a LaTeX error)
        )

    pageNumberPattern=r"(\s)(p\.)(\s)(\d)"  # like 'p. 23'; the dot is literal
    pageNumberRE=re.compile(pageNumberPattern)
    # The second digit is only looked at, not consumed, so that every hyphen
    # in a chain such as '1-2-3' is converted.
    enDashPattern=r"(\d)-(?=\d)"  # like '7-9'
    enDashRE=re.compile(enDashPattern)
    emDashPattern=r"((\D)-{1,2}(\D))"  # like 'but-no'
    emDashRE=re.compile(emDashPattern)

    def texifyString(self,s):
        """Convert a string to a form more acceptable to LaTeX.
          s  Plain text string.
        Returns the converted string.  When countQuotes is set this also
        advances the instance's quote counters.
        """
        # Do simple substitutions
        for (old,new) in self._simpleSubstitutions:
            s=s.replace(old,new)
        if self.countQuotes:  # make even-numbered instances open, odd closed
            pieces=[]  # collected output; joined once at the end
            priorC=None
            for c in s:
                if c==u"'":
                    if (self.singleQuoteCount % 2)==0:
                        if priorC==u'"':  # open double quote followed by open single quote
                            pieces.append(u"\\,")  # add a thinspace
                        pieces.append(u"`")  # open single quote
                    else:
                        pieces.append(u"'")  # close single quote
                    self.singleQuoteCount+=1
                elif c==u'"':
                    if (self.doubleQuoteCount % 2)==0:
                        pieces.append(u"``")  # open double quotes
                    else:
                        if priorC==u"'":  # single close quote followed by double close quote
                            pieces.append(u"\\,")  # add a thinspace
                        pieces.append(u"''")  # close double quotes
                    self.doubleQuoteCount+=1
                else:
                    pieces.append(c)
                priorC=c
            s=u"".join(pieces)
        else:
            # The backslash must be doubled here: a bare "\v" in a Python
            # string literal is a vertical tab, not the start of \verb.
            # NOTE(review): this searches for a backtick, although the
            # replacement is an apostrophe -- possibly u"'" was intended;
            # left as it was, confirm.
            s=s.replace(u'`',u"\\verb!'!") #  maybe u"\textquotesingle" if you \usepackage{textcomp}?
            s=s.replace(u'"',u'\\verb!"!') # maybe u"$^\\textquotestraightdblbase$" if you \usepackage{textcomp}?
        if self.refinements:
            s=self.pageNumberRE.sub(r'\1\2\\,\4',s)  # replace " p. 2" with " p.\,2"
            s=self.enDashRE.sub(r'\1--',s)  # replace "2-3" with "2--3"
            s=self.emDashRE.sub(r'\2---\3',s)  # replace "but -- no" with "but --- no"
        return s

    def output(self,restartCounts=False):
        """Return a string that has the parts escaped.  That clears the
        internal buffer.
          restartCounts=False  Reset the counters for whether a single or
            double quote is opening or closed
        """
        # Convert eagerly: the counters must be advanced by the conversion
        # before they are (optionally) reset below.
        r=[self.texifyString(part) for part in self.strings]
        self.strings=[]
        if restartCounts:
            self.singleQuoteCount=0
            self.doubleQuoteCount=0
        return "".join(r)

# Minimal LaTeX document wrapper: latexHead goes before the converted text
# and latexFoot after it.
latexHead="".join([
    "\\documentclass{article}\n",
    "\\begin{document}\n",
    ])
latexFoot="\\end{document}"

#............... script main body
if __name__=='__main__':
    # Command line driver: copy the input to the output a line at a time,
    # escaping each line for LaTeX.

    # Defaults, each of which an option below may override
    inputFilename=None   # -f; None means read stdin
    outputFilename=None  # -o; None means write stdout
    countQuotes=True     # -c turns this off
    refinements=True     # -r turns this off
    latexWrap=False      # -l turns this on
    verbose=False        # -v turns this on

    usage="""%s: Convert plain text so it may make it through LaTeX 
  %s [options] 
where the options are
  -f filename (default %s) file to read from; if None then stdin is used
  -o filename (default %s) file to write to; if None then stdout is used
  -c Turn off the attempt to balance open and closed quotes
  -r Turn off the refinements (em dashes, en dashes ..)
  -l Add simple LaTeX header and footer 
  -v (default %s) sets verbose output
  --help or -?  Give this usage statement""" % (sys.argv[0],sys.argv[0],repr(inputFilename),repr(outputFilename),repr(verbose))

    # parse args
    shortOptions='f:o:lcr?v'
    longOptions=['help']
    try:
        # positional arguments (args_proper) are accepted but ignored
        (opts,args_proper)=getopt(sys.argv[1:],shortOptions,longOptions)
    except GetoptError as err:
        # Complaints go to stderr so they cannot end up in converted output
        sys.stderr.write("ERROR: Unable to parse the command line arguments: %s\n" % (err,))
        sys.stderr.write(usage+"\n")
        sys.exit(1)
    for (option,parameter) in opts:
        if option=='-f':
            inputFilename=parameter
        elif option=='-o':
            outputFilename=parameter
        elif option=='-c':
            countQuotes=False
        elif option=='-r':
            refinements=False
        elif option=='-l':
            latexWrap=True
        elif option=='-v':
            verbose=True
        elif option in ('-?','--help'):
            print(usage)
            sys.exit(0)
        else:
            sys.stderr.write("Unknown option:  %s\n" % (option,))
            sys.exit(2)

    # Done getting options; now the logic
    et=escapetext(countQuotes=countQuotes,refinements=refinements)
    inFile=sys.stdin
    outFile=sys.stdout
    try:
        if inputFilename is not None:
            inFile=open(inputFilename,'r')
        if outputFilename is not None:
            outFile=open(outputFilename,'w')
        if latexWrap:
            outFile.write(latexHead)
        # One line at a time; the quote counts carry over between lines
        for line in inFile:
            et.feed(line)
            outFile.write(et.output())
        if latexWrap:
            outFile.write(latexFoot)
    finally:
        # Close only the files opened here; the standard streams are left
        # open for the rest of the process.
        if inFile is not sys.stdin:
            inFile.close()
        if outFile is not sys.stdout:
            outFile.close()

    if verbose:
        # stderr, so the message is not mixed into output written to stdout
        sys.stderr.write("%s: done\n" % (sys.argv[0],))