#!/usr/bin/python ### FILE: topic-sentences.py (Version 2.0) ### Copyright (c) 2023 by Gene Cooperman (gene@ccs.neu.edu) ### This file may be freely copied and modified as long as ### this copyright notice remains. ### With this script, you can now just write whatever you find interesting. ### Don't worry about filling in a formal outline. ### ### The outline is now the _output_ of your text -- not the input. Instead of ### outlining and then writing, you can now just write, and then read this ### automatically generated outline to see if your "story" sounds right. ### If the "story" doesn't sound right, then that's a "bug" in your writing ### (similar to a "bug" in a computer program). So, the outline shows you the ### location of the "bug", and it should then be easy to go back and revise ### your writing to fix it. And after that, you can automatically regenerate ### the outline to check that the bug has been fixed. ### FIXME: The pandoc command as a filter for '-t pdf': ### To create a pdf using pandoc, use -t latex|beamer|context|ms|html5 ### and specify an output file with .pdf extension (-o filename.pdf). ### FIXME: When we create --markdown, the .md retains definitions like '\name' ### So, then when converting back to latex, we have to fix that, ### or else put back the "\def" lines from the user. ### FIXME: pdf extraction? https://stackoverflow.com/questions/42093548/splitting-pdf-files-into-paragraphs ### https://stackoverflow.com/questions/61540569/how-to-reorganize-a-pdf-file-into-paragraph ### https://stackoverflow.com/questions/76020709/detecting-paragraphs-in-a-pdf import sys import os import textwrap import subprocess import re # If true, we either output markdown or else use pandoc on it. 
# Standard-library modules used by the code below.
import os
import re
import subprocess
import sys
import textwrap

# Output-format flags accepted on the command line.
OUT_FLAGS = {"--markdown", "--html", "--text", "--latex", "--pdf"}
# LaTeX commands that open a (sub)section, ordered by nesting level.
LATEX_HEADINGS = ("\\section", "\\subsection", "\\subsubsection")
# Line prefixes that end the current paragraph without being paragraph text.
PARAGRAPH_BREAKS = ("\\section{", "\\subsection{", "\\subsubsection{",
                    "\\paragraph{")
# LaTeX environments whose whole body is left out of the outline.
EXCLUDED_ENVIRONMENTS = ("table", "figure", "table*", "figure*",
                         "itemize", "enumerate", "description")
# Files that pdflatex may leave behind next to the PDF it produces.
LATEX_BYPRODUCTS = (".aux", ".log", ".out", ".toc")

_ABBREVIATION = re.compile(r"(\b[A-Z][a-z]+)\.")
_LATEX_HEADING = re.compile(r"\\((?:sub){0,2})section\*?\s*\{")
_FIRST_SECTION = re.compile(r"\\section\*?\s*\{")
_LABEL = re.compile(r"\\label\s*\{[^}]*\}")

# Run-time settings.  main() fills them in from the command line; the
# defaults describe plain-text output from a non-LaTeX input.
is_latex = False     # True when the input file is LaTeX (name ends in .tex)
to_latex = set()     # subset of {"--pdf", "--latex"} that was requested
to_markdown = set()  # flags for which the outline is built as markdown
use_pandoc = []      # flags needing a conversion step ("--latex" -> "--tex")


def _braced_text(text, start):
    """Return (content, end) of the brace group whose '{' precedes *start*.

    Nested braces are matched.  *end* is the index just past the closing
    brace.  If the group is never closed, the rest of *text* is returned.
    """
    depth = 1
    for pos in range(start, len(text)):
        if text[pos] == "{":
            depth += 1
        elif text[pos] == "}":
            depth -= 1
            if depth == 0:
                return text[start:pos], pos + 1
    return text[start:], len(text)


def latex_title(file):
    r"""Return the argument of the first \title{...} in *file*, or "".

    Nested braces inside the title are kept, and runs of whitespace
    (including newlines) are collapsed to single spaces, because the
    result is placed on one line of a YAML header.
    """
    with open(file, "r") as latex_file:
        latex_code = latex_file.read()
    marker = "\\title{"
    start = latex_code.find(marker)
    if start < 0:
        return ""
    title, _ = _braced_text(latex_code, start + len(marker))
    return " ".join(title.split())


## FIXME:  Also support the general case (arbitrary heading commands).
section = "0.0.0"


def section_num(line):
    r"""Advance the global section counter for heading *line*; return it.

    The level is the number of leading '#' characters, or 1/2/3 for a line
    starting with \section, \subsection or \subsubsection.  A line that is
    not a heading leaves the counter unchanged.  Trailing ".0" components
    are removed from the returned string ("2.0.0" becomes "2").
    """
    global section
    counters = [int(x) for x in section.split('.')]
    level = len(line) - len(line.lstrip('#'))
    if level == 0:
        for depth, command in enumerate(LATEX_HEADINGS, start=1):
            if line.startswith(command):
                level = depth
                break
    if level > 0:
        # Headings deeper than the current counter get extra components.
        counters.extend([0] * (level - len(counters)))
        counters[level - 1] += 1
        for i in range(level, len(counters)):
            counters[i] = 0
    section = '.'.join(str(x) for x in counters)
    output = section
    while output.endswith(".0"):
        output = output[:-2]
    return output


def topic_sentence(paragraph):
    r"""Return the first sentence of *paragraph*, without its final period.

    A paragraph that begins with "\begin" yields "".  The period of
    "et al." and the period directly after any capitalised word (such as
    "Fig." or "Dr.") are dropped first, so they do not end the sentence.
    As a consequence, a sentence that ends in a capitalised word is merged
    with the sentence that follows it.
    """
    # A paragraph could begin with "  \begin{itemize}".  If so, remove it.
    if paragraph.lstrip().startswith("\\begin"):
        paragraph = ""
    paragraph = paragraph.replace("et. al.", "et al").replace("et al.", "et al")
    while _ABBREVIATION.search(paragraph):
        paragraph = _ABBREVIATION.sub(r"\1", paragraph)
    return paragraph.split(". ", 1)[0]


def wrap_last_paragraph(story):
    """Re-wrap the text after the last newline of *story* with textwrap.

    Everything up to and including that newline is returned unchanged.
    """
    head, newline, tail = story.rpartition("\n")
    return head + newline + '\n'.join(textwrap.wrap(tail))


def test_if_latex():
    """Set the global is_latex from the name of the input file; return it."""
    global is_latex
    is_latex = sys.argv[1].endswith(".tex")
    return is_latex


def latex_include_files(lines_in, base_dir=None):
    r"""Return *lines_in* with every \input{name} line replaced by that file.

    File names are resolved relative to *base_dir*, which defaults to the
    directory of the input file named in sys.argv[1]; ".tex" is appended
    when missing.  Included files are expanded again until no \input line
    remains.  Lines of *lines_in* are stripped; the lines of the most
    recently included file may still carry their newline.
    """
    if base_dir is None:
        file_in = sys.argv[1]
        base_dir = file_in.rsplit("/", 1)[0] if "/" in file_in else "."
    lines_out = []
    for line in (raw.strip() for raw in lines_in):
        if line.replace(" ", "").startswith("\\input{"):
            path = base_dir + "/" + line.split("{")[1].split("}")[0].strip()
            if not path.endswith(".tex"):
                path += ".tex"
            with open(path, "r") as included:
                lines_out += included.readlines()
        else:
            lines_out.append(line)
    if any(line.replace(" ", "").startswith("\\input{") for line in lines_out):
        lines_out = latex_include_files(lines_out, base_dir)
    return lines_out


def latex_exclude(lines_in):
    r"""Return *lines_in* without bibliography commands, floats and lists.

    Lines starting with \bibliographystyle{ or \bibliography{ are dropped.
    Every line from a \begin{env} through the matching \end{env} is
    dropped for the environments in EXCLUDED_ENVIRONMENTS.  Nested
    environments of the same name are matched by counting, and an
    environment opened and closed on one line drops only that line.
    """
    lines_out = []
    env = ""     # name of the environment currently being skipped
    depth = 0    # nesting depth of `env`; 0 means "not skipping"
    for line in lines_in:
        compact = line.strip().replace(" ", "")
        if depth > 0:
            if compact.startswith("\\begin{" + env + "}"):
                depth += 1
            elif compact.startswith("\\end{" + env + "}"):
                depth -= 1
            continue
        if line.lstrip().startswith(("\\bibliographystyle{",
                                     "\\bibliography{")):
            continue
        if compact.startswith("\\begin{"):
            name = compact.split("{", 1)[1].split("}", 1)[0]
            if name in EXCLUDED_ENVIRONMENTS:
                if "\\end{" + name + "}" not in compact:
                    env, depth = name, 1
                continue
        lines_out.append(line)
    return lines_out


# NOT USED:  Too complex, and failed experiment
def latex_remove_lists(lines_in):
    r"""Return *lines_in* with list environments removed by a regex.

    The match is greedy: everything from the first \begin{env} to the last
    \end{env} of the same kind is removed, including text between two
    separate lists.
    """
    text = "\n".join(lines_in)
    for env in ("itemize", "enumerate", "description"):
        text = re.sub(r"\\begin{" + env + r"}[\w\W]*\\end{" + env + "}",
                      "", text)
    return text.split("\n")


def is_end_last_paragraph(line):
    r"""Return True if *line* is blank or starts a \section-like command."""
    return line == "" or line.startswith(PARAGRAPH_BREAKS)


def _split_heading(line):
    r"""Split a heading line into (level, title, trailing_text).

    *line* starts with '#' or with one of LATEX_HEADINGS.  For LaTeX the
    title is the brace argument; \label{...} is removed from the text that
    follows it on the same line.
    """
    if line.startswith("#"):
        title = line.lstrip("#")
        return len(line) - len(title), title.strip(), ""
    match = _LATEX_HEADING.match(line)
    if match is None:
        # Unusual form such as \section[short]{long}: keep the text as is.
        cut = line.index("section")
        level = 1 + line[:cut].count("sub")
        return level, line[cut + len("section"):].lstrip("*").strip(), ""
    level = 1 + len(match.group(1)) // len("sub")
    title, end = _braced_text(line, match.end())
    return level, title.strip(), _LABEL.sub("", line[end:]).strip()


def _outline_sentence(paragraph):
    """Return the topic sentence of *paragraph*, ready to append to a story.

    The sentence is followed by ". " so that consecutive topic sentences
    do not run together.  No period is added after existing end
    punctuation or after a lone LaTeX command such as a bare macro call.
    """
    sentence = topic_sentence(paragraph).strip()
    if not sentence:
        return ""
    bare_command = sentence.startswith("\\") and " " not in sentence
    if bare_command or sentence.endswith((".", "!", "?", ":")):
        return sentence + " "
    return sentence + ". "


def build_story(lines):
    r"""Build the outline text from the stripped input *lines*.

    The result holds one heading per section followed by the topic
    sentences of that section's paragraphs, wrapped as one paragraph.  The
    globals is_latex, to_latex and to_markdown select the flavour:
    LaTeX (preamble and headings copied verbatim), markdown (YAML header
    plus '#' headings) or plain text ("Section N: title").
    """
    paragraph = ""
    story = ""
    if to_markdown:
        story = ("---\ntitle: " + latex_title(sys.argv[1])
                 + "\nnumbersections: true\n...\n\n")
    begin_paragraphs = False
    saw_end_document = False
    for line in lines:
        if is_latex and not begin_paragraphs:
            first_section = _FIRST_SECTION.search(line) is not None
            if to_latex:
                # Copy everything up to and including the first \section.
                story += line + "\n"
                begin_paragraphs = first_section
                continue
            if not first_section:
                continue  # the preamble is not part of the outline
            begin_paragraphs = True  # fall through: emit this heading

        if is_latex and line.startswith("\\end{document}"):
            saw_end_document = True
            break

        if is_latex and '%' in line and '\\%' not in line:
            code = line[:line.find('%')]
            if to_latex:
                line = code + "\\relax"  # Bare newline would be new parag.
            elif not code.strip():
                continue  # comment-only line: neither text nor a break
            else:
                line = code.rstrip()

        is_heading = line.startswith('#') or line.startswith(LATEX_HEADINGS)
        ends_paragraph = is_heading or is_end_last_paragraph(line)

        # If end of paragraph, save topic sentence.
        if paragraph and ends_paragraph:
            story += _outline_sentence(paragraph)
            paragraph = ""

        # If end of section, wrap topic sentences into a last paragraph,
        # and add the new section title.
        if is_heading:
            story = wrap_last_paragraph(story)
            if is_latex and to_latex:
                story += "\n\n" + line + "\n"
            else:
                level, title, rest = _split_heading(line)
                if to_markdown:
                    story = (story.rstrip() + "\n\n" + "#" * level + " "
                             + title + "\n\n")
                else:
                    story = (story.rstrip() + "\n\nSection "
                             + section_num(line) + ": " + title + "\n\n ")
                if rest:
                    # Text after the heading on the same line opens the
                    # first paragraph of the section.
                    paragraph = rest + " "
            continue

        # Blank lines, \paragraph lines and markdown underlines for
        # section/subsection headings carry no paragraph text.
        if ends_paragraph or line.startswith(('====', '----')):
            continue
        paragraph += line.strip() + " "

    if paragraph:
        story += _outline_sentence(paragraph)
    story = wrap_last_paragraph(story)
    if is_latex and to_latex and saw_end_document:
        story += "\n\\end{document}\n"
    return story


def _write_text(path, text):
    """Write *text* to the file *path*, replacing any previous content."""
    with open(path, "w") as out_file:
        out_file.write(text)


def _run_pandoc(story, out_filename):
    """Convert the markdown *story* to *out_filename* with pandoc.

    Exits with status 1 if pandoc is not installed or reports an error.
    """
    cmd = ["pandoc", "-f", "markdown", "-s", "-o", out_filename]
    try:
        result = subprocess.run(cmd, encoding="UTF-8", input=story,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, timeout=300)
    except FileNotFoundError:
        print("pandoc command not found. Please install 'pandoc'.")
        sys.exit(1)
    if result.returncode != 0:
        print(result.stdout)
        print(result.stderr)
        sys.exit(1)


def _tex_to_pdf(tex_filename):
    """Run pdflatex on *tex_filename*; return the path of the PDF.

    pdflatex writes into the current directory.  The PDF is moved next to
    *tex_filename*, the .tex file is removed, and the files listed in
    LATEX_BYPRODUCTS are deleted.  Exits with status 1 if pdflatex is
    missing or no PDF was produced; the .tex file is then kept.
    """
    cmd = ["pdflatex", tex_filename]
    shown = " ".join(cmd)
    print("\n" + "*"*60 + "\n*** " + sys.argv[0] + ": " + shown)
    try:
        result = subprocess.run(cmd, encoding="UTF-8", timeout=300)
    except FileNotFoundError:
        print("Stopped at " + tex_filename + "; latex command not found")
        sys.exit(1)
    if result.returncode != 0:
        print("*** (error encountered in: " + shown + ")")
    print("*"*60 + "\n")

    pdf_filename = os.path.splitext(tex_filename)[0] + ".pdf"
    produced = os.path.basename(pdf_filename)
    if not os.path.exists(produced):
        print("*** No " + produced + " was produced; keeping " + tex_filename)
        sys.exit(1)
    os.remove(tex_filename)
    if os.path.abspath(produced) != os.path.abspath(pdf_filename):
        os.rename(produced, pdf_filename)
    job = os.path.splitext(produced)[0]
    for extension in LATEX_BYPRODUCTS:
        if os.path.isfile(job + extension):
            os.remove(job + extension)
    return pdf_filename


def _write_outline(story):
    """Write *story* in the requested output format; return the file name.

    The name is the input name with its extension replaced by "-summary"
    plus the extension of the output format.
    """
    out_filename = os.path.splitext(sys.argv[1])[0] + "-summary"
    if not use_pandoc:
        out_filename += ".md" if to_markdown else ".tex" if to_latex else ".txt"
        _write_text(out_filename, story)
        return out_filename

    if sys.version_info < (3, 6):
        print("Please use python3.7 (preferred) or python3.6")
        sys.exit(1)
    wants_pdf = use_pandoc[0] == "--pdf"
    # A PDF is always produced from an intermediate .tex file.
    filetype = "tex" if wants_pdf else use_pandoc[0]
    out_filename += "." + filetype.lstrip("-")
    if is_latex and to_latex:
        # The story already is LaTeX: latex to latex/pdf doesn't use pandoc.
        _write_text(out_filename, story)
    else:
        # In every other case the story has been built as markdown.
        _run_pandoc(story, out_filename)
    if wants_pdf:
        out_filename = _tex_to_pdf(out_filename)
    return out_filename


def main():
    """Parse the command line, build the outline and write it out."""
    global is_latex, to_latex, to_markdown, use_pandoc
    argv = sys.argv
    is_latex = any(arg.endswith(".tex") for arg in argv)
    to_latex = {"--pdf", "--latex"}.intersection(argv)
    to_markdown = {"--markdown", "--html", "--pdf", "--latex"}
    if is_latex:
        to_markdown -= {"--latex", "--pdf"}  # built directly as LaTeX
    to_markdown = to_markdown.intersection(argv)

    out_args = set(argv) - OUT_FLAGS
    if len(out_args) != 2 or not OUT_FLAGS.intersection(argv):
        print("USAGE: " + argv[0] + " [" + "|".join(sorted(OUT_FLAGS))
              + "] FILE\n")
        sys.exit(1)

    # ARGS:  --text, --html, --pdf, --markdown, --latex
    # Remove "--text", since it's the default.  --markdown needs no
    # conversion step.
    use_pandoc = [arg.replace("latex", "tex") for arg in argv
                  if arg.startswith("-")
                  and arg not in ["--text", "--markdown"]]
    sys.argv = [arg for arg in argv if not arg.startswith("-")]

    with open(sys.argv[1], "r") as source:
        lines = source.readlines()
    test_if_latex()
    if is_latex:
        lines = latex_include_files(lines)
        lines = latex_exclude(lines)
    lines = [line.strip() for line in lines]

    out_filename = _write_outline(build_story(lines))
    if not os.path.exists(out_filename):
        print("The file " + out_filename + " could not be written.")
        sys.exit(1)
    print("The file " + out_filename + " was written.")


if __name__ == "__main__":
    main()