#!/usr/bin/python

### FILE: topic-sentences.py (Version 2.0)
### Copyright (c) 2023 by Gene Cooperman (gene@ccs.neu.edu)
### This file may be freely copied and modified as long as
### this copyright notice remains.

### With this script, you can now just write whatever you find interesting.
### Don't worry about filling in a formal outline.
### 
### The outline is now the _output_ of your text -- not the input.  Instead of
### outlining and then writing, you can now just write, and then read this
### automatically generated outline to see if your "story" sounds right.
### If the "story" doesn't sound right, then that's a "bug" in your writing
### (similar to a "bug" in a computer program).  So, the outline shows you the
### location of the "bug", and it should then be easy to go back and revise
### your writing to fix it.  And after that, you can automatically regenerate
### the outline to check that the bug has been fixed.


### FIXME:  The pandoc command as a filter for '-t pdf':
###         To create a pdf using pandoc, use -t latex|beamer|context|ms|html5
###         and specify an output file with .pdf extension (-o filename.pdf).
### FIXME:  When we create --markdown, the .md retains definitions like '\name'
###         So, then when converting back to latex, we have to fix that,
###           or else put back the "\def" lines from the user.
### FIXME: pdf extraction? https://stackoverflow.com/questions/42093548/splitting-pdf-files-into-paragraphs
###        https://stackoverflow.com/questions/61540569/how-to-reorganize-a-pdf-file-into-paragraph
###        https://stackoverflow.com/questions/76020709/detecting-paragraphs-in-a-pdf


import sys
import os
import textwrap
import subprocess
import re

# Classify the requested conversion from the command line.  The names set
# here are read by the main script further below:
#   is_latex    -- truthy when an argument ends in ".tex" (LaTeX input)
#   to_latex    -- truthy when "--latex" or "--pdf" was requested
#   to_markdown -- truthy when the summary is assembled as markdown (it is
#                  then either written as-is or handed to pandoc)
is_latex = [arg for arg in sys.argv if arg.endswith(".tex")]
to_latex = {"--pdf", "--latex"}.intersection(sys.argv)
to_markdown = {"--markdown", "--html", "--pdf", "--latex"}
if is_latex:
  to_markdown -= {"--latex", "--pdf"} # latex to latex/pdf doesn't use pandoc
to_markdown = to_markdown.intersection(sys.argv)

# Exactly two distinct non-flag arguments are expected (the script name and
# the input file), together with at least one recognized output flag.
out_flags = {"--markdown", "--html", "--text", "--latex", "--pdf"}
out_args = set(sys.argv) - out_flags
if len(out_args) != 2 or not out_flags.intersection(sys.argv):
  print("USAGE: " + sys.argv[0] + " [" + "|".join(sorted(out_flags)) + "]"
                    " <INPUT_FILE.[md|tex]>\n")
  sys.exit(1)

# ARGS: --text, --html, --pdf, --markdown, --latex
# Remove "--text", since it's the default.  --markdown doesn't use pandoc.
# "--latex" is renamed to "--tex" so that it doubles as the file extension.
use_pandoc = [arg.replace("latex", "tex") for arg in sys.argv
                 if arg.startswith("-") and arg not in ["--text", "--markdown"]]
if is_latex:
  # LaTeX input with --latex: the summary is already LaTeX, so it must be
  # written directly.  Feeding it to 'pandoc -f markdown' would wrap the
  # original preamble inside a second, pandoc-generated one.
  use_pandoc = [arg for arg in use_pandoc if arg != "--tex"]
# From here on, sys.argv holds only the script name and the input file.
sys.argv = [arg for arg in sys.argv if not arg.startswith("-")]

def latex_title(file):
  """Return the argument of the first "\\title{...}" found in a file.

  file: path of the file to scan (it is read as text in full).

  Returns the text between the braces of the first "\\title{", or "" when
  the file contains no "\\title{".  Nested braces are honored, so
  "\\title{A \\textbf{B} C}" yields "A \\textbf{B} C".  If the braces are
  never balanced, the text up to the first "}" (or the rest of the file when
  there is none) is returned.
  """
  marker = "\\title{"
  # 'with' guarantees that the file is closed even if read() fails.
  with open(file, "r") as latex_file:
    latex_code = latex_file.read()
  start = latex_code.find(marker)
  if start < 0:
    return ""
  rest = latex_code[start + len(marker):]
  depth = 1  # The brace opened by "\title{" itself
  pos = 0
  while pos < len(rest):
    char = rest[pos]
    if char == "\\":
      pos += 2  # Skip an escaped character such as "\{" or "\}"
      continue
    if char == "{":
      depth += 1
    elif char == "}":
      depth -= 1
      if depth == 0:
        return rest[:pos]
    pos += 1
  # Unbalanced braces: fall back to cutting at the first "}".
  return rest.split("}", 1)[0]

## Running section counter, kept between calls as a dotted string ("2.1.0").
section = "0.0.0"
def section_num(line):
  """Advance the global section counter for a heading and return its number.

  line: a heading line.  Markdown headings ('#', '##', '###', ...) and the
        LaTeX commands \\section, \\subsection and \\subsubsection (starred
        or not) are recognized.  Any other line leaves the counter unchanged.

  Returns the dotted section number with trailing ".0" components removed,
  e.g. "2" for the second top-level section and "2.1" for its first
  subsection.  The counter grows as needed, so headings deeper than three
  levels are numbered as well.
  """
  global section
  latex_levels = {"\\section": 1, "\\subsection": 2, "\\subsubsection": 3}
  counters = [int(x) for x in section.split('.')]
  level = len(line) - len(line.lstrip('#'))
  if level == 0:
    for command, depth in latex_levels.items():
      if line.startswith(command):
        level = depth
        break
  if level > 0:
    # Make room for headings deeper than anything seen so far.
    counters += [0] * (level - len(counters))
    counters[level-1] += 1
    # A new heading restarts the numbering of everything below it.
    for i in range(level, len(counters)): counters[i] = 0
  section = '.'.join([str(x) for x in counters])
  output = section
  while output.endswith(".0"):  output = output[:-2]
  return output
def topic_sentence(paragraph):
  """Return the first sentence of a paragraph, terminated by ". ".

  paragraph: the paragraph text, joined onto a single line.

  A paragraph that begins with "\\begin" yields "".  Before the split,
  "et. al." and "et al." lose their period, and so does every capitalized
  word of two or more letters that is directly followed by a period
  (e.g. "Fig." or "Dr."), so that these periods are not mistaken for the
  end of the sentence.  If the paragraph contains no ". " at all, it is
  returned whole.

  The returned sentence keeps its ". " terminator: callers append the
  result directly to the text collected so far, and without the terminator
  consecutive topic sentences would run together.
  """
  # A paragraph could begin with "  \begin{itemize}".  If so, remove it.
  if paragraph.lstrip().startswith("\\begin"):
    return ""
  paragraph = paragraph.replace("et. al.", "et al").replace("et al.", "et al")
  pattern = r"(\b[A-Z][a-z]+)\."
  # Repeat: removing one period can expose another match ("Fig..").
  while re.search(pattern, paragraph):
    paragraph = re.sub(pattern, r"\1", paragraph)
  if ". " not in paragraph:
    return paragraph
  return paragraph.split(". ", 1)[0] + ". "
def wrap_last_paragraph(story):
  """Re-wrap only the text that follows the last newline of 'story'.

  Everything up to and including the last newline is returned unchanged;
  the remainder is wrapped with textwrap.wrap() at its default width.  When
  'story' contains no newline, the whole string is wrapped.
  """
  # With no newline present, rpartition() yields ("", "", story).
  head, newline, tail = story.rpartition("\n")
  wrapped_tail = '\n'.join(textwrap.wrap(tail))
  return head + newline + wrapped_tail
def test_if_latex():
  """Set the global 'is_latex' from the input file name and return it.

  The input file is sys.argv[1]; the result is True exactly when that name
  ends in ".tex".
  """
  global is_latex
  input_name = sys.argv[1]
  is_latex = input_name.endswith(".tex")
  return is_latex

def latex_include_files(lines_in):
  """Expand "\\input{...}" lines in place, recursively.

  lines_in: the lines of the main LaTeX file.

  Every line is stripped of surrounding whitespace.  A line that begins with
  "\\input{" (spaces ignored) is replaced by the lines of the named file,
  which is looked up in the directory of the input file (sys.argv[1]);
  ".tex" is appended to the name if missing.  Included files are expanded
  the same way.

  Returns the expanded list of stripped lines.
  Raises RecursionError if a file (transitively) includes itself, and the
  usual OSError (e.g. FileNotFoundError) if an included file cannot be read.
  """
  file_in = sys.argv[1]
  base_dir = file_in.rsplit("/", 1)[0]  if "/" in file_in else "."

  def expand(lines, active):
    # 'active' lists the files currently being expanded, outermost first.
    expanded = []
    for line in (raw.strip() for raw in lines):
      if not line.replace(" ", "").startswith("\\input{"):
        expanded.append(line)
        continue
      path = base_dir + "/" + line.split("{")[1].split("}")[0].strip()
      if not path.endswith(".tex"):
        path = path + ".tex"
      if path in active:
        raise RecursionError("\\input cycle detected: " + path +
                             " includes itself")
      # 'with' closes each included file as soon as it has been read.
      with open(path, "r") as included:
        included_lines = included.readlines()
      expanded += expand(included_lines, active + [path])
    return expanded

  return expand(lines_in, [])

def latex_exclude(lines_in):
  """Drop bibliography commands and whole float/list environments.

  lines_in: the lines of a LaTeX document.

  Removed are lines that begin with "\\bibliographystyle{" or
  "\\bibliography{", and every line from a "\\begin{ENV}" line through the
  matching "\\end{ENV}" line, where ENV is one of table, figure, table*,
  figure*, itemize, enumerate, description.  Environments of the same kind
  may be nested: the exclusion ends only at the "\\end" that closes the
  outermost one.  "\\begin"/"\\end" are recognized only at the start of a
  line (leading whitespace and embedded spaces are ignored).

  Returns the remaining lines, unchanged and in their original order.
  """
  excluded_envs = ("table", "figure", "table*", "figure*",
                   "itemize", "enumerate", "description")
  bibliography_cmds = ("\\bibliographystyle{", "\\bibliography{")
  lines_out = []
  env = ""   # Environment being skipped; meaningful only while depth > 0
  depth = 0  # Nesting depth of 'env'; 0 means that nothing is being skipped
  for line in lines_in:
    compact = line.strip().replace(" ", "")
    if depth == 0:
      if line.lstrip().startswith(bibliography_cmds):
        continue
      if compact.startswith("\\begin{"):
        name = compact.split("{")[1].split("}")[0]
        if name in excluded_envs:
          env, depth = name, 1
      if depth == 0:
        lines_out.append(line)
    elif compact.startswith("\\begin{" + env + "}"):
      depth += 1  # Same environment nested inside the one being skipped
    if depth > 0 and compact.startswith("\\end{" + env + "}"):
      depth -= 1
  return lines_out

# NOT USED:  Too complex, and failed experiment
def latex_remove_lists(lines_in):
  """Remove itemize/enumerate/description environments from the text.

  lines_in: the lines of a LaTeX document (without trailing newlines).

  The lines are joined, and each span from a "\\begin{ENV}" to its matching
  "\\end{ENV}" is deleted, wherever it occurs in a line.  Nested
  environments of the same kind are removed together with the outermost
  one, while text between two separate lists is kept.  A "\\begin" without
  a matching "\\end" is left in place.

  Returns the remaining text, split back into lines.
  """
  text = "\n".join(lines_in)
  for env in ("itemize", "enumerate", "description"):
    delimiter = re.compile(r"\\(begin|end)\{" + env + r"\}")
    kept = []
    resume = 0      # Where the text to keep continues
    span_start = 0  # Start of the outermost open environment
    depth = 0
    for match in delimiter.finditer(text):
      if match.group(1) == "begin":
        if depth == 0:
          span_start = match.start()
        depth += 1
      elif depth > 0:
        depth -= 1
        if depth == 0:
          kept.append(text[resume:span_start])
          resume = match.end()
    kept.append(text[resume:])
    text = "".join(kept)
  return text.split("\n")

def is_end_last_paragraph(line):
  """Tell whether 'line' ends the paragraph before it.

  Returns True for an empty line.  Otherwise returns the list of sectioning
  commands (\\section, \\subsection, \\subsubsection, \\paragraph) that the
  line starts with, each immediately followed by "{"; the list is empty,
  and therefore falsy, for an ordinary line.
  """
  if line == "":
    return True
  headings = ("\\section", "\\subsection", "\\subsubsection", "\\paragraph")
  return [command for command in headings if line.startswith(command + "{")]

# Read the input file; 'with' closes it as soon as the lines are in memory.
with open(sys.argv[1], "r") as input_file:
  lines = input_file.readlines()
# From here on, 'is_latex' is a bool derived from the input file name.
test_if_latex()
if is_latex:
  lines = latex_include_files(lines)
  lines = latex_exclude(lines)
lines = [line.strip() for line in lines]
# if is_latex:
#   lines = latex_remove_lists(lines)
paragraph = ""            # Text of the input paragraph being collected
newInputParagraph = True
story = ""                # The summary built so far
if to_markdown:
  # Start with a YAML metadata block: pandoc takes the title from it and
  # numbers the sections.
  story = "---\ntitle: " + latex_title(sys.argv[1]) + \
          "\nnumbersections: true\n...\n\n"


# Main loop: collect each input paragraph, append its topic sentence to
# 'story', and emit a heading whenever a new section starts.
begin_document = False # Not used
begin_paragraphs = False
# A line starting with one of these is a section heading (markdown or LaTeX).
heading_prefixes = ('#', '\\section', '\\subsection', '\\subsubsection')
for line in lines:
  # LaTeX -> markdown: skip everything before the first \section.
  if is_latex and to_markdown and not begin_paragraphs:
    if "\\section{" not in line:
      continue
    begin_paragraphs = True
    if not line.startswith('\\section'):
      continue
    # The first \section line itself falls through, so that it becomes a
    # heading like every later section instead of being dropped.
  # LaTeX -> LaTeX: copy everything up to and including the first \section
  # verbatim, so that the summary keeps the original preamble.
  if is_latex and to_latex and not begin_paragraphs:
    story += line + "\n"
    if not begin_document:
      if "\\begin{document}" in line:
        begin_document = True
    if "\\section{" in line:
      begin_paragraphs = True
    continue

  # Cut off a LaTeX comment.  A line that contains an escaped percent sign
  # anywhere is left untouched.
  if is_latex and '%' in line and '\\%' not in line:
    line = line[:line.find('%')]+"\\relax"  # Bare newline would be new parag.
  is_heading = line.startswith(heading_prefixes)
  # If end of paragraph, save topic sentence.
  if paragraph != "" and (is_heading or line == ""):
    story += topic_sentence(paragraph) # end of sentence
    paragraph = ""
    if not is_heading:
      continue
    # A heading that directly follows paragraph text (no blank line in
    # between) still has to be emitted, so keep going.
  # If end of section, wrap topic sentences into a last paragraph,
  # and print the new section title.
  if is_heading:
    story = wrap_last_paragraph(story)
    orig_line = line
    if is_latex and to_latex:
      story += "\n\n" + line + "\n"
    elif to_markdown:
      line = line.replace('\\section*', '# ').replace('\\subsection*', "## ") \
                 .replace('\\subsubsection*', "### ")
      line = line.replace('\\section', '# ').replace('\\subsection', "## ") \
                 .replace('\\subsubsection', "### ")
      story = story.rstrip()
      line = line.strip().replace("# {", "# ").rstrip('}')
      if line.endswith('}') or '{' not in line:
        story += "\n\n" + line + "\n\n"
      else:
        # FIXME: This is a hack!
        # Text after the heading's closing brace is treated as a paragraph.
        # partition() copes with a line that has no "}" left at all.
        heading, _, trailing = line.partition("}")
        story += "\n\n" + heading + "\n\n" + \
                 topic_sentence(trailing) + "\n"
    else:
      line = (line.lstrip("#") if line.startswith('#')
                               else line[line.index("section")+len("section"):])
      story = story.rstrip() + ("\n\nSection " + section_num(orig_line) + ": " +
                                line.strip() + "\n\n    ")
    continue
  # NOTE(review): this test has no effect, so setext underlines ('====',
  # '----') end up in the paragraph text -- confirm whether they should be
  # skipped instead.
  if line.startswith('====') or line.startswith('----'):
    pass # Remove markdown commands for section/subsection headings
  if is_end_last_paragraph(line):
    newInputParagraph = True
    continue
  paragraph += line.strip() + " "
  if newInputParagraph:
    newInputParagraph = False

# Flush the paragraph that was still open when the input ended.
if paragraph != "":
  story += topic_sentence(paragraph) # end of sentence
story = wrap_last_paragraph(story)
# if is_latex and to_latex:
#   story += "\n\\end{document}\n"

# The summary is written next to the input file, as <input>-summary.<ext>.
# splitext() only looks at the last path component, so an input name
# without an extension (or a dot in a directory name) is handled too.
out_filename = os.path.splitext(sys.argv[1])[0] + "-summary"
if not use_pandoc:
  # No converter needed: the story is already in its final format.
  out_filename += ".md" if to_markdown else ".tex" if to_latex else ".txt"
  with open(out_filename, "w") as out_file:
    out_file.write(story)

# Convert the story with pandoc, or -- for LaTeX input with --pdf -- just
# save it as .tex so that pdflatex can be run on it afterwards.
if use_pandoc:
  filetype = use_pandoc[0] if use_pandoc[0] != "--pdf" else "tex"
  out_filename += "." + filetype.lstrip("-")
  if sys.version_info < (3, 6):
    print("Please use python3.7 (preferred) or python3.6")
    sys.exit(1)
  if is_latex and use_pandoc[0] == "--pdf":
    # If output is .pdf, save .tex now, and then use latex.
    with open(out_filename, "w", encoding="UTF-8") as tex_file:
      tex_file.write(story)
  else: # latex and markdown inputs have both been converted to markdown
    # An argument list (no shell) keeps file names with spaces or shell
    # metacharacters intact, and makes a missing pandoc raise
    # FileNotFoundError instead of merely returning exit status 127.
    cmd = ["pandoc", "-f", "markdown", "-s", "-o", out_filename]
    try:
      # stdout/stderr=PIPE is used instead of capture_output, which does
      # not exist in python3.6.
      cmd_out = subprocess.run(cmd, encoding="UTF-8", input=story,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               timeout=300)
    except FileNotFoundError:
      print("pandoc command not found.  Please install 'pandoc'.")
      sys.exit(1)
    if cmd_out.returncode != 0:
      # Show what pandoc reported and stop: no output file was produced.
      print(cmd_out.stdout)
      print(cmd_out.stderr)
      sys.exit(1)

# For --pdf, run pdflatex on the .tex summary, move the resulting PDF next
# to the input file and clean up.  'out_filename' may be None if the
# previous step failed, hence the first test.
if (out_filename and out_filename.endswith(".tex")
    and use_pandoc and use_pandoc[0] == "--pdf"):
  # An argument list (no shell) makes a missing pdflatex raise
  # FileNotFoundError and keeps unusual file names intact.
  cmd = ["pdflatex", out_filename]
  print("\n" + "*"*60 + "\n*** " + sys.argv[0] + ": " + " ".join(cmd))
  try:
    cmd_out = subprocess.run(cmd, encoding="UTF-8", timeout=300)
  except FileNotFoundError:
    print("Stopped at " + out_filename + "; latex command not found")
    sys.exit(1)
  if cmd_out.returncode != 0:
    print("*** (error encountered in: " + " ".join(cmd) + ")")
  print("*"*60 + "\n")

  tex_filename = out_filename
  out_filename = tex_filename.rsplit(".tex", 1)[0] + ".pdf"
  # pdflatex writes <jobname>.pdf, .aux, .log, ... into the current directory.
  jobname = os.path.basename(tex_filename)[:-len(".tex")]
  if not os.path.exists(jobname + ".pdf"):
    # Keep the .tex and .log files, so that the failure can be examined.
    print("pdflatex did not produce " + jobname + ".pdf; see " +
          jobname + ".log")
    sys.exit(1)
  os.remove(tex_filename)
  os.rename(jobname + ".pdf", out_filename)
  # Remove the by-products of this run (for any input name, not just
  # "main"), and nothing else.
  for suffix in (".aux", ".log", ".out", ".toc"):
    if os.path.exists(jobname + suffix):
      os.remove(jobname + suffix)

if out_filename:
  # An explicit test is used, since 'assert' is skipped under 'python -O'.
  if not os.path.exists(out_filename):
    print("ERROR: the output file " + out_filename + " is missing.")
    sys.exit(1)
  print("The file " + out_filename + " was written.")