# textfiles-politics/pythonCode/personTagger.py
#
# 823 lines
# 44 KiB
# Python

import spacy
from collections import Counter
import re as regex
import os
from saxonche import PySaxonProcessor
#### Loads all of the necessary variables and functions.
# Load the large English model, downloading it only when it is not installed yet.
# spacy.cli.download() returns None, so its result must never be bound to `nlp`, and
# calling it unconditionally repeats the download/installation step on every start.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load() raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
#########################################################################################
# ebb: After reading the NLP output, we know spaCy is making some mistakes.
# So, here let's try adding a rule-based component to customize spaCy's classification.
# We need to configure this BEFORE we send the tokens off to nlp() for processing.
##########################################################################################
# Create the ruler (a span_ruler, see the notes below) and insert it before the "ner"
# component of the pipeline.
# Sources:
# W. J. B. Mattingly: https://ner.pythonhumanities.com/02_01_spaCy_Entity_Ruler.html
# spaCy documentation on NER Entity Ruler: https://spacy.io/usage/rule-based-matching#entityruler
# spans_key=None + annotate_ents=True: matches are written to doc.ents rather than doc.spans.
# overwrite=True: matches replace entities already present in doc.ents.
# validate=True: the patterns are validated when they are added.
config = {"spans_key": None, "annotate_ents": True, "overwrite": True, "validate": True}
ruler = nlp.add_pipe("span_ruler", before="ner", config=config)
# 2023-04-07: ebb: NOTE: before="ner" setting seems to allow the spaCy NER rules to prevail over these patterns where
# there is a conflict.
# after="ner" means that the spaCy NER is TOTALLY OVERWRITTEN and invalidated by our patterns.
# Notes: Mattingly has this: ruler = nlp.add_pipe("entity_ruler", after="ner", config={"validate": True})
# But this only works when spaCy doesn't recognize a word / phrase as a named entity of any kind.
# If it recognizes a named entity but tags it wrong, we correct it with the span_ruler, not the entity_ruler
# Custom patterns handed to the span ruler.
# Each entry is {"label": <entity label>, "pattern": <phrase string | list of token dicts>}.
# The label "NULL" is this project's own marker for spans that spaCy tends to tag but that
# are not wanted as named entities.
# Every REGEX value is written as a raw string so that \w, \s, \. etc. reach the regex
# engine unchanged and do not trigger Python's invalid-escape-sequence warning.
# NOTE(review): a token pattern dict describes ONE token, so REGEX values that contain \s+
# can only match if the tokenizer ever yields a token containing whitespace -- confirm
# that these multi-word regex rules actually fire.
patterns = [
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"^-\w+?"}}]},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"^.$"}}]},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"\^+"}}]},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"^\w\w$"}}]},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"^[a-z]+\s+[a-z]+$"}}]},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"^.*?__{2,}.*?$"}}]},
    {"label": "NORP", "pattern": [{"TEXT": {"REGEX": r"CHRISTIAN(ITY|DOM)"}}]},
    {"label": "NORP", "pattern": [{"TEXT": {"REGEX": r"CHRISTIAN\s+NETWORK"}}]},
    # ebb: Don't match on any single characters!
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"[A-Z]{2,}[A-Z][a-z]+"}}]},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"[a-z]{2,}[A-Z][a-z]+"}}]},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"^.*?[a-z][A-Z].*?$"}}]},
    # ebb: Above line attempts to stop matching things like Oak IslandThe Method
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"^[Mm\-]+$"}}]},
    # ebb: Above line attempts to stop matching things Mmm-mm or mm , etc.
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"\w+cia\w+"}}]},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"fed\w+"}}]},
    # SOCIALISMBY RICHARD
    {"label": "GPE", "pattern": [{"TEXT": {"REGEX": r"Babylon(ia)?"}}]},
    {"label": "NORP", "pattern": [{"TEXT": {"REGEX": r"Christiani\s*ty"}}]},
    {"label": "NULL", "pattern": "Christiani"},
    {"label": "NULL", "pattern": "Parallel"},
    {"label": "NULL", "pattern": "the user"},
    {"label": "NULL", "pattern": "Advanced"},
    {"label": "NULL", "pattern": "Believability"},
    {"label": "NULL", "pattern": "Onesuch"},
    {"label": "NULL", "pattern": "the People"},
    {"label": "NULL", "pattern": "REPRINT"},
    {"label": "NULL", "pattern": "The Next Banking Crisis"},
    {"label": "NULL", "pattern": "Martini Glass"},
    {"label": "NULL", "pattern": "the Sheriff"},
    {"label": "NULL", "pattern": "Greets"},
    {"label": "NULL", "pattern": "Families"},
    {"label": "NULL", "pattern": "preparingits"},
    {"label": "NULL", "pattern": "wintry"},
    {"label": "NULL", "pattern": "Interested"},
    {"label": "NULL", "pattern": "Time"},
    {"label": "NULL", "pattern": "contra"},
    {"label": "NULL", "pattern": "Mental Health"},
    {"label": "LAW", "pattern": "Bill of Rights"},
    {"label": "LAW", "pattern": "Emergency Detention Act"},
    {"label": "LAW", "pattern": "Geneva Convention"},
    {"label": "LAW", "pattern": "Official Secrets Act"},
    {"label": "LAW", "pattern": "Executive Order"},
    {"label": "LAW", "pattern": "State Constitution"},
    {"label": "LAW", "pattern": "Constitution"},
    {"label": "LAW", "pattern": "Martial Law"},
    {"label": "LAW", "pattern": "Martial Rule"},
    {"label": "LAW", "pattern": "Alaska Mental Health Bill"},
    {"label": "LAW", "pattern": "Multilateral Protection of War Victims/Prisoners of War"},
    {"label": "LAW", "pattern": "Multilateral Protection of War Victims/Civilian Persons"},
    {"label": "LAW", "pattern": "Public Health Service Draft Act"},
    {"label": "LAW", "pattern": "Uniform Mental Health Act"},
    {"label": "NULL", "pattern": [{"TEXT": {"REGEX": r"Executive Order #[0-9]+"}}]},
    {"label": "NULL", "pattern": "Median"},
    {"label": "NULL", "pattern": "Next"},
    {"label": "NULL", "pattern": "Daily"},
    {"label": "NULL", "pattern": "Justice"},
    {"label": "NULL", "pattern": "pro tem"},
    {"label": "NULL", "pattern": "megs"},
    {"label": "NULL", "pattern": "the Kingdom"},
    {"label": "ORG", "pattern": "The Office of Strategic Services"},
    {"label": "NULL", "pattern": "di"},
    {"label": "NULL", "pattern": "econonic aid"},
    {"label": "NULL", "pattern": "fed"},
    {"label": "NULL", "pattern": "the Temple"},
    {"label": "NULL", "pattern": "Said"},
    {"label": "NULL", "pattern": "Cheez Whiz"},
    {"label": "NULL", "pattern": "the Rich Discover Worthy"},
    {"label": "NULL", "pattern": "Examiner"},
    {"label": "NULL", "pattern": "msen"},
    {"label": "NULL", "pattern": "ORG"},
    {"label": "NULL", "pattern": "Physics A. Mathematical"},
    {"label": "NULL", "pattern": "PALE"},
    {"label": "NULL", "pattern": "Order"},
    {"label": "NULL", "pattern": "Command"},
    {"label": "NULL", "pattern": "jackboots"},
    {"label": "NULL", "pattern": "Human Behavior"},
    {"label": "NULL", "pattern": "SINCE LAWYERS OCCUPY"},
    {"label": "WORK_OF_ART", "pattern": "Digha Nikaya"},
    {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": r"(Ludwig [Vv]an )?Beethoven"}}]},
    {"label": "ORG", "pattern": "Falangist"},
    {"label": "ORG", "pattern": "Congressional committee"},
    {"label": "ORG", "pattern": "The Federal Bureau of Prisons"},
    {"label": "ORG", "pattern": "American Red Cross"},
    {"label": "ORG", "pattern": "Annals of Internal Medicine"},
    {"label": "ORG", "pattern": "Houston Post"},
    {"label": "ORG", "pattern": "Houston Chronicle"},
    {"label": "ORG", "pattern": "Concentration Camp Program"},
    {"label": "ORG", "pattern": "Operation Garden Plot"},
    {"label": "ORG", "pattern": "federal government"},
    {"label": "ORG", "pattern": "British intelligenc"},
    {"label": "ORG", "pattern": "Composite Service Organization"},
    {"label": "ORG", "pattern": "Psychological Operations Organization"},
    {"label": "ORG", "pattern": "Council on Foreign Relations"},
    {"label": "ORG", "pattern": "Dept. of Defense"},
    {"label": "ORG", "pattern": "Mental Health Institution"},
    {"label": "ORG", "pattern": "Dept. of Transportation"},
    {"label": "ORG", "pattern": "Dept. of Justice"},
    {"label": "ORG", "pattern": "L.E.A.F."},
    {"label": "ORG", "pattern": "C.I.A."},
    {"label": "ORG", "pattern": "J.C. Penney"},
    {"label": "ORG", "pattern": "Law Enforcement Assistance Force"},
    {"label": "ORG", "pattern": "Young Americans for Freedom"},
    {"label": "ORG", "pattern": "Military Police Unit"},
    {"label": "ORG", "pattern": "The Annals"},
    {"label": "ORG", "pattern": "Inslaw"},
    {"label": "ORG", "pattern": "Civil Affairs Operations"},
    {"label": "ORG", "pattern": "Civil Affairs Organization"},
    {"label": "ORG", "pattern": "Big Brother"},
    {"label": "ORG", "pattern": "Big brother"},
    {"label": "ORG", "pattern": "State Youthful Offenders Division"},
    {"label": "ORG", "pattern": "The California State Bar's Standing Committee on Professional Responsibility and Conduct"},
    {"label": "ORG", "pattern": "ILLUMINATI"},
    {"label": "ORG", "pattern": "Alaska Bar Association"},
    {"label": "ORG", "pattern": "University of Wisconsin"},
    {"label": "ORG", "pattern": "University of Southern California"},
    {"label": "ORG", "pattern": "ROTC"},
    {"label": "ORG", "pattern": "Plunge"},
    {"label": "ORG", "pattern": "Los Angeles Sheriff's Dept."},
    {"label": "ORG", "pattern": "U.N. Security Council"},
    {"label": "ORG", "pattern": "U.N."},
    {"label": "ORG", "pattern": "Library of Congress"},
    {"label": "ORG", "pattern": "International Congress on Mental health"},
    {"label": "ORG", "pattern": "U. S . Secret Service"},
    {"label": "ORG", "pattern": "Office of Thrift Supervision"},
    {"label": "ORG", "pattern": "Comptroller of the Currency"},
    {"label": "ORG", "pattern": "Urban Plunge"},
    {"label": "ORG", "pattern": "State Department"},
    {"label": "ORG", "pattern": "Dept. of State"},
    {"label": "ORG", "pattern": "G.S.A."},
    {"label": "ORG", "pattern": "State Dept."},
    {"label": "ORG", "pattern": "California National Guard"},
    {"label": "ORG", "pattern": "H.U.D."},
    {"label": "ORG", "pattern": "H.E.W."},
    {"label": "ORG", "pattern": "law enforcement"},
    {"label": "ORG", "pattern": "Housing & Urban Development"},
    {"label": "ORG", "pattern": "Dept. of Education"},
    {"label": "NORP", "pattern": "Dropa"},
    {"label": "NORP", "pattern": "Viet Cong"},
    {"label": "NORP", "pattern": "Egyptian"},
    {"label": "NORP", "pattern": "Lybian"},
    {"label": "NORP", "pattern": "Cuban"},
    {"label": "NORP", "pattern": "Japanese"},
    {"label": "NORP", "pattern": "Nicaraguan"},
    {"label": "NORP", "pattern": "African"},
    {"label": "NORP", "pattern": "Indian"},
    {"label": "NORP", "pattern": "Icelandic"},
    {"label": "NORP", "pattern": "Russian"},
    {"label": "NORP", "pattern": "Clandestinism"},
    {"label": "NORP", "pattern": "Trilateralists"},
    {"label": "NORP", "pattern": "Spaniard"},
    {"label": "LOC", "pattern": "Vietnam Moratorium"},
    {"label": "LOC", "pattern": "Paddington station"},
    {"label": "LOC", "pattern": "David Munson Air Base"},
    {"label": "LOC", "pattern": "Southeast Asia"},
    {"label": "LOC", "pattern": "Eielson Air Force Base"},
    {"label": "LOC", "pattern": "U.S. Army Reserves"},
    {"label": "LOC", "pattern": "Bay of Pigs"},
    {"label": "LOC", "pattern": "US Air Force L. Fletcher Prouty"},
    {"label": "LOC", "pattern": "Bohemian Grove"},
    {"label": "LOC", "pattern": "Broadway"},
    {"label": "NORP", "pattern": "North American"},
    {"label": "NULL", "pattern": "Northern"},
    {"label": "GPE", "pattern": "Allanwood"},
    {"label": "GPE", "pattern": "Westminster"},
    {"label": "GPE", "pattern": "Portland"},
    {"label": "GPE", "pattern": "Richmond"},
    {"label": "GPE", "pattern": "Sacramento"},
    {"label": "GPE", "pattern": "St Louis"},
    {"label": "GPE", "pattern": "New Haven"},
    {"label": "GPE", "pattern": "Milwaukee"},
    {"label": "GPE", "pattern": "Little Rock"},
    {"label": "GPE", "pattern": "Los Angeles"},
    {"label": "GPE", "pattern": "El Paso"},
    {"label": "GPE", "pattern": "Columbia"},
    {"label": "GPE", "pattern": "Butte"},
    {"label": "GPE", "pattern": "State College"},
    {"label": "GPE", "pattern": "Williamsport"},
    {"label": "GPE", "pattern": "Landsdale"},
    {"label": "GPE", "pattern": "Newtown Square"},
    {"label": "GPE", "pattern": "Allentown"},
    {"label": "GPE", "pattern": "New Castle"},
    {"label": "GPE", "pattern": "Beckley"},
    {"label": "GPE", "pattern": "Alton"},
    {"label": "GPE", "pattern": "Stubenville"},
    {"label": "GPE", "pattern": "Buffalo"},
    {"label": "GPE", "pattern": "Belleville"},
    {"label": "GPE", "pattern": "Bloomington"},
    {"label": "GPE", "pattern": "Carbondale"},
    {"label": "GPE", "pattern": "Champaign"},
    {"label": "GPE", "pattern": "Rock Island"},
    {"label": "GPE", "pattern": "Birmingham"},
    {"label": "GPE", "pattern": "Baltimore"},
    {"label": "GPE", "pattern": "Anchorage"},
    {"label": "GPE", "pattern": "Albuquerque"},
    {"label": "GPE", "pattern": "Albany"},
    {"label": "GPE", "pattern": "Washington DC"},
    {"label": "GPE", "pattern": "Washington, D.C."},
    {"label": "GPE", "pattern": "Avon Park"},
    {"label": "GPE", "pattern": "Mill Point"},
    {"label": "GPE", "pattern": "El Reno"},
    {"label": "GPE", "pattern": "Zagreb"},
    {"label": "GPE", "pattern": "ZAGREB"},
    {"label": "LOC", "pattern": "Whiskey Flat"},
    {"label": "LOC", "pattern": "El Salvador air base"},
    {"label": "LOC", "pattern": "Greenville County"},
    {"label": "LOC", "pattern": "Toyland"},
    {"label": "LOC", "pattern": "Owl's Nest"},
    {"label": "LOC", "pattern": "Dachau"},
    {"label": "LOC", "pattern": "Hill Billies"},
    {"label": "LOC", "pattern": "Cave Man's"},
    {"label": "GPE", "pattern": "Nazareth"},
    {"label": "GPE", "pattern": "Latin America"},
    {"label": "GPE", "pattern": "U.S.S.R"},
    {"label": "GPE", "pattern": "New York City"},
    {"label": "GPE", "pattern": "Soviet Union"},
    {"label": "GPE", "pattern": "Viet Nam"},
    {"label": "GPE", "pattern": "U.S.A."},
    {"label": "GPE", "pattern": "U.S."},
    {"label": "GPE", "pattern": "U.S"},
    {"label": "GPE", "pattern": "Saudi Arabia"},
    {"label": "GPE", "pattern": "Washington, DC"},
    {"label": "GPE", "pattern": "the Vatican"},
    {"label": "GPE", "pattern": "British Empire"},
    {"label": "GPE", "pattern": "Pennsylvania"},
    {"label": "ORG", "pattern": "Warren Commission"},
    {"label": "ORG", "pattern": "Hahn group"},
    {"label": "ORG", "pattern": "U.S. Department of Health and Human Services"},
    {"label": "ORG", "pattern": "American Council of Life Insurance"},
    {"label": "ORG", "pattern": "Health Insurance Association of America."},
    {"label": "ORG", "pattern": "U.S. District Court"},
    {"label": "ORG", "pattern": "National Guard"},
    {"label": "ORG", "pattern": "US Army"},
    {"label": "ORG", "pattern": "London Underground"},
    {"label": "ORG", "pattern": "US Justice Department"},
    {"label": "ORG", "pattern": "Sur Coester"},
    {"label": "ORG", "pattern": "SoftQuad Inc."},
    {"label": "ORG", "pattern": "United Nations World Health Organization"},
    {"label": "ORG", "pattern": "L.E.A.A."},
    {"label": "ORG", "pattern": "Justice Dept."},
    {"label": "ORG", "pattern": "Contemporary Research, Inc."},
    {"label": "ORG", "pattern": "British Government"},
    {"label": "ORG", "pattern": "Office of Criminal Justice Planning"},
    {"label": "ORG", "pattern": "Dept. of Defense of the United States"},
    {"label": "ORG", "pattern": "Dept. of Commerce"},
    {"label": "ORG", "pattern": "W.A. Harriman & Co."},
    {"label": "ORG", "pattern": "Securities C. Commission"},
    {"label": "ORG", "pattern": "Dallas Citizens Council"},
    {"label": "ORG", "pattern": "Round Table groups"},
    {"label": "ORG", "pattern": "The U.S. Agriculture Department"},
    {"label": "ORG", "pattern": "Perpetual Hidden Government"},
    {"label": "ORG", "pattern": "Office of the Deputy Chief of Staff of Personnel"},
    {"label": "ORG", "pattern": "Harriman Fifteen Corp."},
    {"label": "ORG", "pattern": "Israeli secret service"},
    {"label": "ORG", "pattern": "Bell Helicopter"},
    {"label": "ORG", "pattern": "Eurocheque"},
    {"label": "ORG", "pattern": "MasterCard"},
    {"label": "ORG", "pattern": "CitiCorp"},
    {"label": "ORG", "pattern": "Taymar, Inc."},
    {"label": "ORG", "pattern": "Kerr McGree Nuclear Corporation"},
    {"label": "ORG", "pattern": "Edward R. Murrow Center"},
    {"label": "ORG", "pattern": "American Ship and Commerce Corp."},
    {"label": "ORG", "pattern": "American government"},
    {"label": "ORG", "pattern": "American Association for the Advancement of Science"},
    {"label": "ORG", "pattern": "G.H. Walker & Co."},
    {"label": "ORG", "pattern": "Office of Preparedness"},
    {"label": "ORG", "pattern": "John Birch Society"},
    {"label": "ORG", "pattern": "U.S. government"},
    {"label": "ORG", "pattern": "Cleveland Field Office"},
    {"label": "ORG", "pattern": "The John F. Kennedy Center for the Performing Arts Education Program"},
    {"label": "ORG", "pattern": "United States government"},
    {"label": "ORG", "pattern": "National Institute of Agrarian Reform"},
    {"label": "ORG", "pattern": "National Institute of Mental Health"},
    {"label": "ORG", "pattern": "INRA"},
    {"label": "ORG", "pattern": "Dept. of the Army"},
    {"label": "ORG", "pattern": "Criminal Justice System"},
    {"label": "ORG", "pattern": "Sheriffs Dept."},
    {"label": "ORG", "pattern": "L.E.A.A. Newsletter"},
    {"label": "ORG", "pattern": "Texas School Book Depository"},
    {"label": "ORG", "pattern": "Senate Select Committee on Intelligence"},
    {"label": "ORG", "pattern": "House Committee on Assassinations"},
    {"label": "ORG", "pattern": "House of Saud"},
    {"label": "ORG", "pattern": "Federal Emergency Management Agency"},
    {"label": "ORG", "pattern": "Special Forces Unit"},
    {"label": "ORG", "pattern": "Defense Central Index"},
    {"label": "ORG", "pattern": "Joint Chiefs of Staff"},
    {"label": "ORG", "pattern": "Federal Information Center"},
    {"label": "ORG", "pattern": "American Counter-intelligence Corps"},
    {"label": "ORG", "pattern": "CIC"},
    {"label": "ORG", "pattern": "British Establishment."},
    {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": r"[A-Z][a-z]+\s+([A-Z]\.\s+[A-Z][a-z]+)?(,\s+Jr\.)"}}]},
    {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": r"[A-Z][a-z]+\s+([A-Z]\.\s+[A-Z][a-z]+)?(\s+Jr\.)"}}]},
    {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": r"[A-Z][a-z]+\s+(([A-Z]\.\s+)?[A-Z][a-z]+([A-Z][a-z]+)?)?\s+Jr\."}}]},
    # The leading character class used to read "[[A-Z]" (a stray extra bracket), which also
    # matched a literal "[" and made the regex compiler warn about a possible nested set.
    {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": r"[A-Z][a-z]+\s+[A-Z]\.\s+([A-Z][a-z])?[A-Z][a-z]+"}}]},
    {"label": "PERSON", "pattern": "Nostradameus"},
    {"label": "PERSON", "pattern": "Harrison Edward Livingstone"},
    {"label": "PERSON", "pattern": "J. Herbert Sawyer"},
    {"label": "PERSON", "pattern": "L. Harmon Zeigler"},
    {"label": "PERSON", "pattern": "C. Everett Koop"},
    {"label": "PERSON", "pattern": "Edward L. van Roden"},
    {"label": "PERSON", "pattern": "Kirschbaum"},
    {"label": "PERSON", "pattern": "Raphael Shumacker"},
    {"label": "PERSON", "pattern": "DAVID MELLOR"},
    {"label": "PERSON", "pattern": "J. W. Willmott"},
    {"label": "PERSON", "pattern": "Morris Ellowitz"},
    {"label": "PERSON", "pattern": "Harry Thon"},
    {"label": "PERSON", "pattern": "Locust"},
    {"label": "PERSON", "pattern": "Nuri Al-Said"},
    {"label": "PERSON", "pattern": "Faisal II"},
    {"label": "PERSON", "pattern": "Abdul Llah"},
    {"label": "PERSON", "pattern": "John Paul I"},
    {"label": "PERSON", "pattern": "John Paul II"},
    {"label": "PERSON", "pattern": "Rockefeller III"},
    {"label": "PERSON", "pattern": "John D. Rockefeller III"},
    {"label": "PERSON", "pattern": "John D. Rockefeller IV"},
    {"label": "PERSON", "pattern": "William H. Draper III"},
    {"label": "PERSON", "pattern": "Eduardo"},
    {"label": "PERSON", "pattern": "G.H. Walker III"},
    {"label": "PERSON", "pattern": "Daniel Gooch"},
    {"label": "PERSON", "pattern": "Ramses II"},
    {"label": "PERSON", "pattern": "Alexander II"},
    {"label": "PERSON", "pattern": "Alexander V"},
    {"label": "PERSON", "pattern": "Alexander IV"},
    {"label": "PERSON", "pattern": "Alexander VI"},
    {"label": "PERSON", "pattern": "CARROLL QUIGLEY"},
    {"label": "PERSON", "pattern": "James I"},
    {"label": "PERSON", "pattern": "Oswald II"},
    {"label": "PERSON", "pattern": "Oswald I"},
    {"label": "PERSON", "pattern": "Umberto I"},
    {"label": "PERSON", "pattern": "C. Hamilton Ellis"},
    {"label": "PERSON", "pattern": "Alfred Ely Beach"},
    {"label": "PERSON", "pattern": "Beach"},
    {"label": "PERSON", "pattern": "Lord Milner"},
    {"label": "PERSON", "pattern": "Agapetus I"},
    {"label": "PERSON", "pattern": "Agapetus II"},
    {"label": "PERSON", "pattern": "C. Victor Raiser II"},
    {"label": "PERSON", "pattern": "C.V. Raiser II"},
    {"label": "PERSON", "pattern": "Boniface VIII"},
    {"label": "PERSON", "pattern": "Boniface VII"},
    {"label": "PERSON", "pattern": "Boniface III"},
    {"label": "PERSON", "pattern": "Boniface II"},
    {"label": "PERSON", "pattern": "Boniface VI"},
    {"label": "PERSON", "pattern": "Gonda"},
    {"label": "PERSON", "pattern": "Richard II"},
    {"label": "PERSON", "pattern": "Richard III"},
    {"label": "PERSON", "pattern": "George Terwilliger III"},
    {"label": "PERSON", "pattern": "C.E. Koop"},
    {"label": "PERSON", "pattern": "Callistus I"},
    {"label": "PERSON", "pattern": "Francis von Hapsburg"},
    {"label": "PERSON", "pattern": "Malcom X"},
    {"label": "PERSON", "pattern": "Victor IV"},
    {"label": "PERSON", "pattern": "Carles C. Messick III"},
    {"label": "PERSON", "pattern": "John D. Rockefeller IV"},
    {"label": "PERSON", "pattern": "MARCONI"},
    {"label": "PERSON", "pattern": "Marconi"},
    {"label": "PERSON", "pattern": "Charles W. Bailey II"},
    {"label": "PERSON", "pattern": "Charles I"},
    {"label": "PERSON", "pattern": "Charles McKee"},
    {"label": "PERSON", "pattern": "Charles X"},
    {"label": "PERSON", "pattern": "Charles V"},
    {"label": "PERSON", "pattern": "Jerome I"},
    {"label": "PERSON", "pattern": "Mark I"},
    {"label": "PERSON", "pattern": "John XXIII"},
    {"label": "PERSON", "pattern": "Ferdinand I"},
    {"label": "PERSON", "pattern": "Nicholas I"},
    {"label": "PERSON", "pattern": "Nicholas II"},
    {"label": "PERSON", "pattern": "Nicholas V"},
    {"label": "PERSON", "pattern": "John Zajac"},
    {"label": "PERSON", "pattern": "Sylvester I"},
    {"label": "PERSON", "pattern": "Sylvester II"},
    {"label": "PERSON", "pattern": "Engenius IV"},
    {"label": "PERSON", "pattern": "Philip IV"},
    {"label": "PERSON", "pattern": "Stephen II"},
    {"label": "PERSON", "pattern": "Stephen VI"},
    {"label": "PERSON", "pattern": "Pious X"},
    {"label": "PERSON", "pattern": "Honorius II"},
    {"label": "PERSON", "pattern": "Sergius III"},
    {"label": "PERSON", "pattern": "Adrian I"},
    {"label": "PERSON", "pattern": "Adrian VI"},
    {"label": "PERSON", "pattern": "William F. Hamilton III"},
    {"label": "PERSON", "pattern": "James Baker III"},
    {"label": "PERSON", "pattern": "William Stamps Farish III"},
    {"label": "PERSON", "pattern": "Elizabeth II"},
    {"label": "PERSON", "pattern": "Anacletus II"},
    {"label": "PERSON", "pattern": "Edward II"},
    {"label": "PERSON", "pattern": "Edward III"},
    {"label": "PERSON", "pattern": "Edward V"},
    {"label": "PERSON", "pattern": "Matthew XVI"},
    {"label": "PERSON", "pattern": "Thothmes III"},
    {"label": "PERSON", "pattern": "Harry Hurt III"},
    {"label": "PERSON", "pattern": "Napoleon III"},
    {"label": "PERSON", "pattern": "Clement VIII"},
    {"label": "PERSON", "pattern": "Clement I"},
    {"label": "PERSON", "pattern": "Clement VI"},
    {"label": "PERSON", "pattern": "Clement IV"},
    {"label": "PERSON", "pattern": "Pius X"},
    {"label": "PERSON", "pattern": "Pius II"},
    {"label": "PERSON", "pattern": "Pius IX"},
    {"label": "PERSON", "pattern": "Pius XI"},
    {"label": "PERSON", "pattern": "Pius XII"},
    {"label": "PERSON", "pattern": "Leo IX"},
    {"label": "PERSON", "pattern": "Leo X"},
    {"label": "PERSON", "pattern": "Leo III"},
    {"label": "PERSON", "pattern": "Leo XIII"},
    {"label": "PERSON", "pattern": "Amenhotep IV"},
    {"label": "PERSON", "pattern": "Charles W. Bailey II"},
    {"label": "PERSON", "pattern": "Constantine"},
    {"label": "PERSON", "pattern": "Tut-ankh-amen"},
    {"label": "PERSON", "pattern": "Sixtus V"},
    {"label": "PERSON", "pattern": "Sixtus IV"},
    {"label": "PERSON", "pattern": "Benedict XIV"},
    {"label": "PERSON", "pattern": "Benedict VI"},
    {"label": "PERSON", "pattern": "Benedict IX"},
    {"label": "PERSON", "pattern": "Benedict XV"},
    {"label": "PERSON", "pattern": "George I"},
    {"label": "PERSON", "pattern": "George III"},
    {"label": "PERSON", "pattern": "Gregory VII"},
    {"label": "PERSON", "pattern": "Gregory I"},
    {"label": "PERSON", "pattern": "Gregory XVI"},
    {"label": "PERSON", "pattern": "Gregory IX"},
    {"label": "PERSON", "pattern": "Constantine II"},
    {"label": "PERSON", "pattern": "Albert I"},
    {"label": "PERSON", "pattern": "Albert V. Bryan Jr"},
    {"label": "PERSON", "pattern": "Alfonso XII"},
    {"label": "PERSON", "pattern": "Alfonso XIII"},
    {"label": "PERSON", "pattern": "Gustavus III"},
    {"label": "PERSON", "pattern": "Gustav III"},
    {"label": "PERSON", "pattern": "Felix III"},
    {"label": "PERSON", "pattern": "Valentinian III"},
    {"label": "PERSON", "pattern": "Innocent III"},
    {"label": "PERSON", "pattern": "Innocent IV"},
    {"label": "PERSON", "pattern": "Frederick, II"},
    {"label": "PERSON", "pattern": "Frederick II"},
    {"label": "PERSON", "pattern": "Theodosius I"},
    {"label": "PERSON", "pattern": "Henry VIII"},
    {"label": "PERSON", "pattern": "Henry VI"},
    {"label": "PERSON", "pattern": "Henry IV"},
    {"label": "PERSON", "pattern": "Louis IX"},
    {"label": "PERSON", "pattern": "Louis XVI"},
    {"label": "PERSON", "pattern": "Joseph II"},
    {"label": "PERSON", "pattern": "Catherine II"},
    {"label": "PERSON", "pattern": "James D Bryant II"},
    {"label": "PERSON", "pattern": "Paul III"},
    {"label": "PERSON", "pattern": "Paul II"},
    {"label": "PERSON", "pattern": "Julius II"},
    {"label": "PERSON", "pattern": "Paul VI"},
    {"label": "PERSON", "pattern": "William II"},
    {"label": "PERSON", "pattern": "William III"},
    {"label": "PERSON", "pattern": "Baldwin II"},
    {"label": "PERSON", "pattern": "Urban VIII"},
    {"label": "PERSON", "pattern": "Charles Goodhue, III"},
    {"label": "PERSON", "pattern": "Edwin Meese, III"},
    {"label": "PERSON", "pattern": "Thomas Downing"},
    {'label': 'PERSON', 'pattern': 'James "Bo" Gritz'},
    {"label": "PERSON", "pattern": "James Earl Ray"},
    {"label": "PERSON", "pattern": "S. Rilling"},
    {"label": "PERSON", "pattern": "Scott Weekly"},
    {"label": "PERSON", "pattern": "Ford, L.H"},
    {"label": "PERSON", "pattern": "A. Vilenkin"},
    {"label": "PERSON", "pattern": "Renate Viebahn"},
    {"label": "PERSON", "pattern": "Lustick, Ian S."},
    {"label": "PERSON", "pattern": "Friedman, Robert I."},
    {"label": "PERSON", "pattern": "Howard Sprague"},
    {"label": "PERSON", "pattern": "Mark Clark"},
    {"label": "PERSON", "pattern": "Owen"},
    {"label": "PERSON", "pattern": "Theodore Shackley"},
    {"label": "PERSON", "pattern": "Clines"},
    {"label": "PERSON", "pattern": "L. F. Prouty"},
    {"label": "PERSON", "pattern": "L. Fletcher Prouty"},
    {"label": "PERSON", "pattern": "Knight, Amy W."},
    {"label": "PERSON", "pattern": "Martindale"},
    {"label": "PERSON", "pattern": "R. Knox"},
    {"label": "PERSON", "pattern": "Leon Oswald"},
    {"label": "PERSON", "pattern": "H. Ross Perot"},
    {"label": "PERSON", "pattern": "Stanly R. Larsen"},
    {"label": "PERSON", "pattern": "L.E. Allen Jr."},
    {"label": "PERSON", "pattern": "A. Lewis"},
    {"label": "PERSON", "pattern": "D. Fleming"},
    {"label": "PERSON", "pattern": "J. Edgar Hoover"},
    {"label": "PERSON", "pattern": "V. Pirie"},
    {"label": "PERSON", "pattern": "Nancy B."},
    {"label": "PERSON", "pattern": "Ronald Reagan"},
    {"label": "PERSON", "pattern": "P. Gibbs"},
    {"label": "PERSON", "pattern": "W. Citrine"},
    {"label": "PERSON", "pattern": "E. Moore"},
    {"label": "PERSON", "pattern": "Lyndon Johnson"},
    {"label": "PERSON", "pattern": "John Kennedy"},
    {"label": "PERSON", "pattern": "Zia ul-Haque"},
    {"label": "PERSON", "pattern": "Ronald Payne"},
    {"label": "PERSON", "pattern": "R. Muldoon"},
    {"label": "PERSON", "pattern": "F. Orr"},
    {"label": "PERSON", "pattern": "Frank H. Schwable"},
    {"label": "PERSON", "pattern": "R. F. Doyle"},
    {"label": "PERSON", "pattern": "David Munson"},
    {"label": "PERSON", "pattern": "Frances Clark"},
    {"label": "PERSON", "pattern": "G. Gordon Broadbent"},
    {"label": "PERSON", "pattern": "J. Segal"},
    {"label": "PERSON", "pattern": "F. Brangwyn"},
    {"label": "PERSON", "pattern": "A. Luchaire"},
    {"label": "PERSON", "pattern": "L. Segal"},
    {"label": "PERSON", "pattern": "R. William Davis"},
    {"label": "PERSON", "pattern": "William W. Quinn"},
    {"label": "PERSON", "pattern": "William R. Pabst"},
    {"label": "PERSON", "pattern": "William Pabst"},
    {"label": "PERSON", "pattern": "G. Maeferren"},
    {"label": "PERSON", "pattern": "Dian-Lanz"},
    {"label": "PERSON", "pattern": "Buria"},
    {"label": "PERSON", "pattern": "North"},
    {"label": "PERSON", "pattern": "Ortho III"},
    {"label": "PERSON", "pattern": "Santo Trafficante"},
    {"label": "PERSON", "pattern": "Felix Rodreguez"},
    {"label": "PERSON", "pattern": "Stanly F. Yoles"},
    {"label": "PERSON", "pattern": "G. B. Chisholm"},
    {"label": "PERSON", "pattern": "Baron Kurt von Schroeder"},
    {"label": "PERSON", "pattern": "George Bush"},
    {"label": "PERSON", "pattern": "Roland Harriman"},
    {"label": "PERSON", "pattern": "John Foster Dulles"},
    {"label": "PERSON", "pattern": "Karen Silkwood"},
    {"label": "PERSON", "pattern": "N. McQuire"},
    {"label": "PERSON", "pattern": "William Highland"},
    {"label": "PERSON", "pattern": "Harriman"},
    {"label": "PERSON", "pattern": "William Saxby"},
    {"label": "PERSON", "pattern": "Ellis O. Briggs"},
    {"label": "PERSON", "pattern": "Daniel Ludwig"},
    {"label": "PERSON", "pattern": "Robert C. Klowers"},
    {"label": "PERSON", "pattern": "Tatum B. Laird"},
    {"label": "PERSON", "pattern": "William F. Buckley, Jr."},
    {"label": "PERSON", "pattern": "William Randolph Hearst, Jr."},
    {"label": "PERSON", "pattern": "Justin Dart"},
    {"label": "PERSON", "pattern": "Larry McDonald"},
    {"label": "PERSON", "pattern": "Lievense"},
    {"label": "PERSON", "pattern": "CASTRO"},
    {"label": "PERSON", "pattern": "Erik Jonsson"},
    {"label": "PERSON", "pattern": "Charles E. Allen"},
    {"label": "PERSON", "pattern": "Laurence H. Shoup"},
    {"label": "PERSON", "pattern": "David Rieff"},
    {"label": "PERSON", "pattern": "Diamond"},
    {"label": "PERSON", "pattern": "Donald DeFreeze"},
    {"label": "PERSON", "pattern": "McGovern"},
    {"label": "PERSON", "pattern": "McCarthy"},
    {"label": "PERSON", "pattern": "John Connally"},
    {"label": "PERSON", "pattern": "Lyndon LaRouche"},
    {"label": "PERSON", "pattern": "LaRouche"},
    {"label": "PERSON", "pattern": "W.H. Bowart"},
    {"label": "PERSON", "pattern": "Lehrman"},
    {"label": "PERSON", "pattern": "John McCone"},
    {"label": "PERSON", "pattern": "Al Haig"},
    {"label": "PERSON", "pattern": "J. Latimer Clark"},
    {"label": "PERSON", "pattern": "T. W. Rammell"},
    {"label": "PERSON", "pattern": "Elizabeth II"},
    {"label": "PERSON", "pattern": "Martha Honey"},
    {"label": "PERSON", "pattern": "Vince Bielski"},
    {"label": "PERSON", "pattern": "Anton Chaitkin"},
    {"label": "PERSON", "pattern": "Dean Burch"},
    {"label": "PERSON", "pattern": "Burch"},
    {"label": "PERSON", "pattern": "Gaius Caesar"},
    {"label": "PERSON", "pattern": "Gaius Chaerea"},
    {"label": "PERSON", "pattern": "McCone"},
    {"label": "PERSON", "pattern": "Richard Nixon"},
    {"label": "PERSON", "pattern": "David Wise"},
    {"label": "PERSON", "pattern": "Zapruder"},
    {"label": "PERSON", "pattern": "Mortimer J. Adler"},
    {"label": "PERSON", "pattern": "Angela Davis"},
    {"label": "PERSON", "pattern": "Segals"},
    {"label": "PERSON", "pattern": "COLIN WALLACE"},
    {"label": "PERSON", "pattern": "Thomas B. Ross"},
    {"label": "PERSON", "pattern": "Bowart"},
    {"label": "PERSON", "pattern": "Frank H. Schwable"},
    {"label": "PERSON", "pattern": "John F. Kennedy"},
    {"label": "PERSON", "pattern": "R. J. Biggar"},
    {"label": "PERSON", "pattern": "David R. Hunter"},
    {"label": "PERSON", "pattern": "Hugh Everett, III"},
    {"label": "PERSON", "pattern": "Sigmund Diamond"},
    {"label": "PERSON", "pattern": "Montagnier"},
    {"label": "PERSON", "pattern": "W. Schmunger"},
    {"label": "PERSON", "pattern": "Vince Bielski"},
    {"label": "PERSON", "pattern": "Lee Harvey Oswald"},
    {"label": "PERSON", "pattern": "Schmunger"},
    {"label": "PERSON", "pattern": "Stranglove"},
    {"label": "PERSON", "pattern": "James Calcutt"},
    {"label": "PERSON", "pattern": "Florentine Giovanni Boccaccio"},
    {"label": "PERSON", "pattern": "Prescott Bush"},
    {"label": "PERSON", "pattern": "Maxwell"},
    {"label": "PERSON", "pattern": "Daniel Schorr"},
    {"label": "PERSON", "pattern": "Allard K. Lowenstein"},
    {"label": "PERSON", "pattern": "Daniel Patrick Moynihan"},
    {"label": "ORG", "pattern": "National Institute of Health"},
    {"label": "ORG", "pattern": "Department of Justice"},
    {"label": "ORG", "pattern": "Yakuza"},
    {"label": "ORG", "pattern": "YAKUZA"},
    {"label": "ORG", "pattern": "U.S. Department of Justice"},
    {"label": "ORG", "pattern": "FBI Field Office"},
    {"label": "ORG", "pattern": "Bureau of Investigation"},
    {"label": "ORG", "pattern": "Dallas Citizens Council"},
    {"label": "ORG", "pattern": "University of Maryland,"},
    {"label": "ORG", "pattern": "Carnegie Corporation"},
    {"label": "ORG", "pattern": "Katzenbach Committee"},
    {"label": "ORG", "pattern": "Democratic Congress"},
    {"label": "ORG", "pattern": "National Endowment for Democracy"},
    {"label": "ORG", "pattern": "Congress for Cultural Freedom"},
    {"label": "ORG", "pattern": "Cambridge University Press"},
    {"label": "ORG", "pattern": "Association of National Security Alumni"},
    {"label": "ORG", "pattern": "Avon Books"},
    {"label": "ORG", "pattern": "Ministry of Defence"},
    {"label": "ORG", "pattern": "Ramparts Press"},
    {"label": "ORG", "pattern": "Grove Press"},
    {"label": "ORG", "pattern": "National Association of Scholars"},
    {"label": "ORG", "pattern": "Council on Foreign Relations"},
    {"label": "ORG", "pattern": "Symbionese Liberation Army"},
    {"label": "ORG", "pattern": "Washington Times"},
    {"label": "ORG", "pattern": "Center for Strategic and International Studies"},
    {"label": "ORG", "pattern": "School of Foreign Service"},
    {"label": "ORG", "pattern": "The Anglo-American Establishment"},
    {"label": "ORG", "pattern": "Centre of Eternity"},
    {"label": "ORG", "pattern": "Ruling Elite"},
    {"label": "ORG", "pattern": "US Congress"},
    {"label": "ORG", "pattern": "National Academy of Sciences"},
    {"label": "ORG", "pattern": "Ramparts"},
    {"label": "NULL", "pattern": "Bab"},
    {"label": "LOC", "pattern": "Kingdom of God"},
    {"label": "LOC", "pattern": "Frankfurt airport"},
    {"label": "LOC", "pattern": "Dealey Plaza"},
    {"label": "LOC", "pattern": "Central Africa"},
    {"label": "LOC", "pattern": "Fort Detrick"},
    {"label": "LOC", "pattern": "Clifton suspension bridge"},
    {"label": "NULL", "pattern": "mandkind"},
    {"label": "MISC", "pattern": "AIDS virus"},
    {"label": "MISC", "pattern": "AIDS"},
    {"label": "MISC", "pattern": "AIDS-1"},
    {"label": "MISC", "pattern": "AIDS VIRUS"},
    {"label": "MISC", "pattern": "Human Adult Leukemia virus"},
    {"label": "MISC", "pattern": "LAV"},
    {"label": "MISC", "pattern": "ARC"},
    {"label": "MISC", "pattern": "HTLV-1"},
    {"label": "MISC", "pattern": "HTLV-III"},
    {"label": "MISC", "pattern": "HTLV"},
    {"label": "MISC", "pattern": "HIV"},
    {"label": "MISC", "pattern": "HIV virus"},
    {"label": "MISC", "pattern": "Sheep Visna Virus"},
    {"label": "MISC", "pattern": "Pneumocystis carinii pneumonia"},
    {"label": "MISC", "pattern": "Kaposi's sarcoma"},
    {"label": "MISC", "pattern": "Smallpox"},
    {"label": "MISC", "pattern": "Polio"},
    {"label": "MISC", "pattern": "LSD"},
    {"label": "MISC", "pattern": "Poison"},
    {"label": "MISC", "pattern": "Acquired Immuno-Deficiency Syndrome"},
    {"label": "MISC", "pattern": "Equal Opportunity Employer"},
    {"label": "MISC", "pattern": "Visna"},
    {"label": "MISC", "pattern": "visna"},
    {"label": "MISC", "pattern": "visna virus"},
    {"label": "EVENT", "pattern": "Korean War"},
    {"label": "EVENT", "pattern": "Railway Mania"},
    {"label": "EVENT", "pattern": "Vietnam War"},
    {"label": "EVENT", "pattern": "Apollo 11"},
    {"label": "EVENT", "pattern": "Johnson's war on poverty"},
    {"label": "MISC", "pattern": "AIDS"},
    {"label": "NORP", "pattern": "Chinesse"},
    {"label": "NORP", "pattern": "Vietnamese"},
    {"label": "NORP", "pattern": "multiculturalists"},
    {"label": "NORP", "pattern": "Vietnamese"},
    {"label": "NORP", "pattern": "Anglo-Saxon"},
]
# Register every custom pattern from the `patterns` list with the span ruler.
ruler.add_patterns(patterns)
# Input and output locations are resolved against the current working directory, so the
# script must be launched from the directory these relative paths assume.
workingDir = os.getcwd()
# Directory the source files are read from.
CollPath = os.path.join(workingDir, '../regexConspTest')
# Directory the copied/processed files are written to (note the trailing slash).
outputPath = os.path.join(workingDir, 'personTestingOutput/')
# Everything in original conspiracy directory.
# os.listdir raises if CollPath does not exist, so a wrong working directory fails here.
insideDir = os.listdir(CollPath)
print(insideDir)
# Copies files in case they do not exist
def copyTextFiles(file):
    """Copy one file from the input collection into the output folder.

    The file is read and written as UTF-8 text. An existing copy with the
    same name in the output folder is overwritten.

    Args:
        file: Name of the file inside ``CollPath`` (no directory part).
    """
    sourcePath = os.path.join(CollPath, file)
    targetPath = os.path.join(outputPath, file)
    # The ``with`` block closes the handle on exit; no explicit close() is needed.
    with open(sourcePath, 'r', encoding='utf8') as inFile:
        content = inFile.read()
    print(" ~~~~~~~~~~~~~~~~~~~~~~~~~~~ copying " + file + " ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ")
    # Create the output folder on first use so a fresh checkout does not fail
    # with FileNotFoundError before anything has been copied.
    os.makedirs(outputPath, exist_ok=True)
    with open(targetPath, 'w', encoding='utf8') as f:
        f.write(content)
# Function runs through the entities of the given spaCy doc. Each entity's text and label are stored in a dictionary (text -> label), which is returned. Called by regexFile().
def entitycollector(tokens):
    """Collect the named entities of a processed document.

    Every entity is also logged, one ``[text, label]`` list per line, to
    ``conspPERSON.txt`` in the current working directory.
    NOTE(review): the log is opened in 'w' mode, so each call replaces the
    log written by the previous call -- confirm that is intended.

    Args:
        tokens: Object exposing an ``ents`` iterable whose items have
            ``text`` and ``label_`` attributes and can be sorted
            (regexFile() passes the result of ``nlp(...)``).

    Returns:
        dict mapping entity text to entity label. When the same text occurs
        with several labels, the label of the last occurrence in sorted
        order wins.
    """
    entities = {}
    # Explicit encoding, like every other open() in this file, so the log does
    # not depend on the platform's default encoding.
    with open('conspPERSON.txt', 'w', encoding='utf8') as f:
        for ent in sorted(tokens.ents):
            f.write(str([ent.text, ent.label_]))
            f.write('\n')
            entities[ent.text] = ent.label_
    return entities
# Function runs regex through given file.
def regexFile(file):
    """Tag the named entities of one copied XML file with <ent> elements.

    Steps:
      1. Read the copy of ``file`` from the output folder.
      2. Pull the text of every <p> element out with Saxon XPath.
      3. Reduce that text to letters and spaces and run it through spaCy.
      4. Wrap every PERSON / LOC / ORG / GPE / NORP / EVENT entity found in
         ``<ent type='LABEL'>...</ent>`` inside the XML and save the result.
      5. Hand the file to checkTags() for clean-up.

    Args:
        file: Name of the file inside ``outputPath``.
    """
    fileDir = os.path.join(outputPath, file)
    # Read through a context manager so the handle is closed right away.
    with open(fileDir, encoding='utf-8') as inFile:
        xml = inFile.read()
    with PySaxonProcessor(license=False) as proc:
        xp = proc.new_xpath_processor()
        node = proc.parse_xml(xml_text=xml)
        xp.set_context(xdm_item=node)
        # The XPath normalizes the whitespace of every <p> and joins them all
        # into a single string. Convert to str while the processor is open.
        xpath = xp.evaluate('//p ! normalize-space() => string-join()')
        string = str(xpath)
    # Keep letters and spaces only. The class is spelled A-Za-z because the
    # range A-z would also keep the characters [ \ ] ^ _ and the backtick.
    # Line breaks are removed by this same substitution.
    cleanedText = regex.sub(r'[^A-Za-z ]+', ' ', string)
    tokens = nlp(cleanedText)
    dictEntities = entitycollector(tokens)
    wrappedText = _wrapEntities(xml, dictEntities)
    # Write once, and only when something was actually wrapped.
    if wrappedText != xml:
        with open(fileDir, 'w', encoding='utf8') as f:
            f.write(wrappedText)
    # ebb: send the tagged file to the checkTags() function for cleaning.
    checkTags(file)


# Entity labels that get wrapped in <ent> elements.
_TAGGABLE_LABELS = frozenset({"PERSON", "LOC", "ORG", "GPE", "NORP", "EVENT"})

# Anything that is markup and not running text: tags and entity references.
_MARKUP = regex.compile(r"(<[^>]*>|&[#\w]+;)")


def _wrapEntities(xmlText, entities):
    """Return xmlText with every taggable entity wrapped in an <ent> element.

    Args:
        xmlText: Full XML source of the document.
        entities: dict mapping entity text to its label.

    Returns:
        The XML source with ``<ent type='LABEL'>text</ent>`` around each
        whole-word occurrence of a taggable entity in the running text.
        Markup itself (element names, attributes, entity references) is
        never touched.
    """
    # Whitespace-only entity texts are dropped: they name nothing, and an
    # empty search string would match at every position.
    wanted = {
        text: label
        for text, label in entities.items()
        if label in _TAGGABLE_LABELS and text.strip()
    }
    if not wanted:
        return xmlText
    for text in wanted:
        print("WRAPPING " + text)
    # One alternation, longest entity first, applied in a single pass: the
    # longer name wins over a shorter one it contains, and text inside a
    # freshly inserted <ent> is never wrapped a second time.
    longestFirst = sorted(wanted, key=len, reverse=True)
    entityPattern = regex.compile(
        r"(?<!\w)(?:" + "|".join(regex.escape(text) for text in longestFirst) + r")(?!\w)"
    )

    def wrap(match):
        found = match.group(0)
        return "<ent type='" + wanted[found] + "'>" + found + "</ent>"

    # Splitting on a capturing group leaves running text at the even indexes
    # and the markup that separated it at the odd ones.
    pieces = _MARKUP.split(xmlText)
    for index in range(0, len(pieces), 2):
        pieces[index] = entityPattern.sub(wrap, pieces[index])
    return "".join(pieces)
# ebb: Added above line to send the tagged file to the checkTags() function for cleaning.
# This part of the code is a WIP.
# ebb: I just activated it, and it works! (Nice job.) I altered it just a bit. May need more regexes to match.
## It tries to find weird or invalid elements/tags and fix them.
def checkTags(file):
    """Repair broken or nested tagging in one file of the output folder.

    The file is rewritten in place. Every line is passed through the same
    clean-up substitutions and written back whether or not it changed, so
    lines without a problem are preserved as they were (ebb: only writing
    the lines that matched would leave the output files mostly empty).

    Args:
        file: Name of the file inside ``outputPath``.
    """
    fileDir = os.path.join(outputPath, file)
    with open(fileDir, 'r', encoding='utf8') as inFile:
        content = inFile.readlines()
    with open(fileDir, 'w', encoding='utf8') as f:
        for origLine in content:
            newLine = _repairLine(origLine)
            if origLine != newLine:
                print("broken line found, fixing...")
                print(origLine + "\n INTO.")
                print(newLine)
            f.write(newLine)
    print("File checking finished.")


# An <ent> sitting directly inside another <ent>; groups 1-3 keep the outer
# element and the inner element's text, dropping only the inner tags.
_NESTED_ENT = regex.compile(
    r"(<ent type='[A-z]+?'>[^<]*?)<ent type='[A-z]+?'>([^<]+?)</ent>([^<]*?</ent>)"
)


def _repairLine(line):
    """Return line with the known kinds of broken tagging removed."""
    # An entity tag that landed inside an element name:
    # <spe<ent type='ORG'>cia</ent>l>
    newLine = regex.sub(r"(</?spe)<ent type='ORG'>(cia)</ent>(l>)", r"\1\2\3", line)
    # ebb: triple or quadruple nested <ent> tags do happen. Each pass removes
    # one level of nesting, so repeat until a pass changes nothing; only the
    # outermost tags are preserved. Every effective pass shortens the line,
    # so the loop always ends.
    while True:
        flattened = _NESTED_ENT.sub(r"\1\2\3", newLine)
        if flattened == newLine:
            break
        newLine = flattened
    # A <name> element wrapped around only the first part of a word.
    newLine = regex.sub(r"<name type='\w+?'>(\w+?)</name>(\w+)", r"\1\2", newLine)
    # An entity tag that landed inside the type attribute of another one:
    # <ent type='<ent type='ORG'>ORG</ent>'>
    newLine = regex.sub(r"(<ent type=')<ent type='ORG'>(ORG)</ent>('>)", r"\1\2\3", newLine)
    return newLine
# Driver: copy every file of the input collection into the output folder,
# then tag the copy.
for file in insideDir:
    copyTextFiles(file)
    # regexFile() itself finishes by calling checkTags() on the tagged copy.
    regexFile(file)
    # checkTags(file)
    # ebb: You don't really want to activate checkTags here,
    # because it would run over the untagged input files.