mirror of
https://github.com/nhammer514/textfiles-politics.git
synced 2024-12-27 16:29:33 -05:00
823 lines
44 KiB
Python
823 lines
44 KiB
Python
import spacy
|
|
from collections import Counter
|
|
import re as regex
|
|
import os
|
|
from saxonche import PySaxonProcessor
|
|
|
|
|
|
#### Loads all of the necessary variables and functions.
|
|
# Fetch the large English model only when it is not already installed.
# Downloading unconditionally re-runs pip (and requires network access) on
# every execution, and the download call's return value is not a pipeline,
# so it must not be bound to `nlp`.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")
# Load the pipeline object that the rest of the script uses.
nlp = spacy.load("en_core_web_lg")
|
|
#########################################################################################
|
|
# ebb: After reading the NLP output, we know spaCy is making some mistakes.
|
|
# So, here let's try adding an EntityRuler to customize spaCy's classification. We need
|
|
# to configure this BEFORE we send the tokens off to nlp() for processing.
|
|
##########################################################################################
|
|
# Create the span ruler (spaCy's "span_ruler", not the "entity_ruler") and set it so the ner comes after, so OUR rules take precedence
|
|
# Sources:
|
|
# W. J. B. Mattingly: https://ner.pythonhumanities.com/02_01_spaCy_Entity_Ruler.html
|
|
# spaCy documentation on NER Entity Ruler: https://spacy.io/usage/rule-based-matching#entityruler
|
|
# Options handed to the span_ruler factory when the component is created.
config = dict(
    spans_key=None,
    annotate_ents=True,
    overwrite=True,
    validate=True,
)
# Register the span_ruler in the pipeline immediately ahead of the "ner" component.
ruler = nlp.add_pipe("span_ruler", config=config, before="ner")
|
|
# 2023-04-07: ebb: NOTE: before="ner" setting seems to allow the spaCy NER rules to prevail over these patterns where
|
|
# there is a conflict.
|
|
# after="ner" means that the spaCy NER is TOTALLY OVERWRITTEN and invalidated by our patterns.
|
|
|
|
# Notes: Mattingly has this: ruler = nlp.add_pipe("entity_ruler", after="ner", config={"validate": True})
|
|
# But this only works when spaCy doesn't recognize a word / phrase as a named entity of any kind.
|
|
# If it recognizes a named entity but tags it wrong, we correct it with the span_ruler, not the entity_ruler
|
|
patterns = [
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^-\w+?"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.$"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "\^+"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^\w\w$"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^[a-z]+\s+[a-z]+$"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.*?__{2,}.*?$"}}]},
|
|
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "CHRISTIAN(ITY|DOM)"}}]},
|
|
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "CHRISTIAN\s+NETWORK"}}]},
|
|
# ebb: Don't match on any single characters!
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "[A-Z]{2,}[A-Z][a-z]+"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "[a-z]{2,}[A-Z][a-z]+"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.*?[a-z][A-Z].*?$"}}]},
|
|
# ebb: Above line attempts to stop matching things like Oak IslandThe Method
|
|
{"label": "NULL", "pattern": [{"TEXT" : {"REGEX": "^[Mm\-]+$"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "\w+cia\w+"}}]},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "fed\w+"}}]},
|
|
# SOCIALISMBY RICHARD
|
|
# ebb: The "^[Mm\-]+$" pattern above attempts to stop matching things like Mmm-mm or mm, etc.
|
|
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Babylon(ia)?"}}]},
|
|
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "Christiani\s*ty"}}]},
|
|
{"label": "NULL", "pattern": "Christiani"},
|
|
{"label": "NULL", "pattern": "Parallel"},
|
|
{"label": "NULL", "pattern": "the user"},
|
|
{"label": "NULL", "pattern": "Advanced"},
|
|
{"label": "NULL", "pattern": "Believability"},
|
|
{"label": "NULL", "pattern": "Onesuch"},
|
|
{"label": "NULL", "pattern": "the People"},
|
|
{"label": "NULL", "pattern": "REPRINT"},
|
|
{"label": "NULL", "pattern": "The Next Banking Crisis"},
|
|
{"label": "NULL", "pattern": "Martini Glass"},
|
|
{"label": "NULL", "pattern": "the Sheriff"},
|
|
{"label": "NULL", "pattern": "Greets"},
|
|
{"label": "NULL", "pattern": "Families"},
|
|
{"label": "NULL", "pattern": "preparingits"},
|
|
{"label": "NULL", "pattern": "wintry"},
|
|
{"label": "NULL", "pattern": "Interested"},
|
|
{"label": "NULL", "pattern": "Time"},
|
|
{"label": "NULL", "pattern": "contra"},
|
|
{"label": "NULL", "pattern": "Mental Health"},
|
|
{"label": "LAW", "pattern": "Bill of Rights"},
|
|
{"label": "LAW", "pattern": "Emergency Detention Act"},
|
|
{"label": "LAW", "pattern": "Geneva Convention"},
|
|
{"label": "LAW", "pattern": "Official Secrets Act"},
|
|
{"label": "LAW", "pattern": "Executive Order"},
|
|
{"label": "LAW", "pattern": "State Constitution"},
|
|
{"label": "LAW", "pattern": "Constitution"},
|
|
{"label": "LAW", "pattern": "Martial Law"},
|
|
{"label": "LAW", "pattern": "Martial Rule"},
|
|
{"label": "LAW", "pattern": "Alaska Mental Health Bill"},
|
|
{"label": "LAW", "pattern": "Multilateral Protection of War Victims/Prisoners of War"},
|
|
{"label": "LAW", "pattern": "Multilateral Protection of War Victims/Civilian Persons"},
|
|
{"label": "LAW", "pattern": "Public Health Service Draft Act"},
|
|
{"label": "LAW", "pattern": "Uniform Mental Health Act"},
|
|
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "Executive Order #[0-9]+"}}]},
|
|
{"label": "NULL", "pattern": "Median"},
|
|
{"label": "NULL", "pattern": "Next"},
|
|
{"label": "NULL", "pattern": "Daily"},
|
|
{"label": "NULL", "pattern": "Justice"},
|
|
{"label": "NULL", "pattern": "pro tem"},
|
|
{"label": "NULL", "pattern": "megs"},
|
|
{"label": "NULL", "pattern": "the Kingdom"},
|
|
{"label": "ORG", "pattern": "The Office of Strategic Services"},
|
|
{"label": "NULL", "pattern": "di"},
|
|
{"label": "NULL", "pattern": "econonic aid"},
|
|
{"label": "NULL", "pattern": "fed"},
|
|
{"label": "NULL", "pattern": "the Temple"},
|
|
{"label": "NULL", "pattern": "Said"},
|
|
{"label": "NULL", "pattern": "Cheez Whiz"},
|
|
{"label": "NULL", "pattern": "the Rich Discover Worthy"},
|
|
{"label": "NULL", "pattern": "Examiner"},
|
|
{"label": "NULL", "pattern": "msen"},
|
|
{"label": "NULL", "pattern": "ORG"},
|
|
{"label": "NULL", "pattern": "Physics A. Mathematical"},
|
|
{"label": "NULL", "pattern": "PALE"},
|
|
{"label": "NULL", "pattern": "Order"},
|
|
{"label": "NULL", "pattern": "Command"},
|
|
{"label": "NULL", "pattern": "jackboots"},
|
|
{"label": "NULL", "pattern": "Human Behavior"},
|
|
{"label": "NULL", "pattern": "SINCE LAWYERS OCCUPY"},
|
|
{"label": "WORK_OF_ART", "pattern": "Digha Nikaya"},
|
|
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "(Ludwig [Vv]an )?Beethoven"}}]},
|
|
{"label": "ORG", "pattern": "Falangist"},
|
|
{"label": "ORG", "pattern": "Congressional committee"},
|
|
{"label": "ORG", "pattern": "The Federal Bureau of Prisons"},
|
|
{"label": "ORG", "pattern": "American Red Cross"},
|
|
{"label": "ORG", "pattern": "Annals of Internal Medicine"},
|
|
{"label": "ORG", "pattern": "Houston Post"},
|
|
{"label": "ORG", "pattern": "Houston Chronicle"},
|
|
{"label": "ORG", "pattern": "Concentration Camp Program"},
|
|
{"label": "ORG", "pattern": "Operation Garden Plot"},
|
|
{"label": "ORG", "pattern": "federal government"},
|
|
{"label": "ORG", "pattern": "British intelligenc"},
|
|
{"label": "ORG", "pattern": "Composite Service Organization"},
|
|
{"label": "ORG", "pattern": "Psychological Operations Organization"},
|
|
{"label": "ORG", "pattern": "Council on Foreign Relations"},
|
|
{"label": "ORG", "pattern": "Dept. of Defense"},
|
|
{"label": "ORG", "pattern": "Mental Health Institution"},
|
|
{"label": "ORG", "pattern": "Dept. of Transportation"},
|
|
{"label": "ORG", "pattern": "Dept. of Justice"},
|
|
{"label": "ORG", "pattern": "L.E.A.F."},
|
|
{"label": "ORG", "pattern": "C.I.A."},
|
|
{"label": "ORG", "pattern": "J.C. Penney"},
|
|
{"label": "ORG", "pattern": "Law Enforcement Assistance Force"},
|
|
{"label": "ORG", "pattern": "Young Americans for Freedom"},
|
|
{"label": "ORG", "pattern": "Military Police Unit"},
|
|
{"label": "ORG", "pattern": "The Annals"},
|
|
{"label": "ORG", "pattern": "Inslaw"},
|
|
{"label": "ORG", "pattern": "Civil Affairs Operations"},
|
|
{"label": "ORG", "pattern": "Civil Affairs Organization"},
|
|
{"label": "ORG", "pattern": "Big Brother"},
|
|
{"label": "ORG", "pattern": "Big brother"},
|
|
{"label": "ORG", "pattern": "State Youthful Offenders Division"},
|
|
{"label": "ORG", "pattern": "The California State Bar's Standing Committee on Professional Responsibility and Conduct"},
|
|
{"label": "ORG", "pattern": "ILLUMINATI"},
|
|
{"label": "ORG", "pattern": "Alaska Bar Association"},
|
|
{"label": "ORG", "pattern": "University of Wisconsin"},
|
|
{"label": "ORG", "pattern": "University of Southern California"},
|
|
{"label": "ORG", "pattern": "ROTC"},
|
|
{"label": "ORG", "pattern": "Plunge"},
|
|
{"label": "ORG", "pattern": "Los Angeles Sheriff's Dept."},
|
|
{"label": "ORG", "pattern": "U.N. Security Council"},
|
|
{"label": "ORG", "pattern": "U.N."},
|
|
{"label": "ORG", "pattern": "Library of Congress"},
|
|
{"label": "ORG", "pattern": "International Congress on Mental health"},
|
|
{"label": "ORG", "pattern": "U. S . Secret Service"},
|
|
{"label": "ORG", "pattern": "Office of Thrift Supervision"},
|
|
{"label": "ORG", "pattern": "Comptroller of the Currency"},
|
|
{"label": "ORG", "pattern": "Urban Plunge"},
|
|
{"label": "ORG", "pattern": "State Department"},
|
|
{"label": "ORG", "pattern": "Dept. of State"},
|
|
{"label": "ORG", "pattern": "G.S.A."},
|
|
{"label": "ORG", "pattern": "State Dept."},
|
|
{"label": "ORG", "pattern": "California National Guard"},
|
|
{"label": "ORG", "pattern": "H.U.D."},
|
|
{"label": "ORG", "pattern": "H.E.W."},
|
|
{"label": "ORG", "pattern": "law enforcement"},
|
|
{"label": "ORG", "pattern": "Housing & Urban Development"},
|
|
{"label": "ORG", "pattern": "Dept. of Education"},
|
|
{"label": "NORP", "pattern": "Dropa"},
|
|
{"label": "NORP", "pattern": "Viet Cong"},
|
|
{"label": "NORP", "pattern": "Egyptian"},
|
|
{"label": "NORP", "pattern": "Lybian"},
|
|
{"label": "NORP", "pattern": "Cuban"},
|
|
{"label": "NORP", "pattern": "Japanese"},
|
|
{"label": "NORP", "pattern": "Nicaraguan"},
|
|
{"label": "NORP", "pattern": "African"},
|
|
{"label": "NORP", "pattern": "Indian"},
|
|
{"label": "NORP", "pattern": "Icelandic"},
|
|
{"label": "NORP", "pattern": "Russian"},
|
|
{"label": "NORP", "pattern": "Clandestinism"},
|
|
{"label": "NORP", "pattern": "Trilateralists"},
|
|
{"label": "NORP", "pattern": "Spaniard"},
|
|
{"label": "LOC", "pattern": "Vietnam Moratorium"},
|
|
{"label": "LOC", "pattern": "Paddington station"},
|
|
{"label": "LOC", "pattern": "David Munson Air Base"},
|
|
{"label": "LOC", "pattern": "Southeast Asia"},
|
|
{"label": "LOC", "pattern": "Eielson Air Force Base"},
|
|
{"label": "LOC", "pattern": "U.S. Army Reserves"},
|
|
{"label": "LOC", "pattern": "Bay of Pigs"},
|
|
{"label": "LOC", "pattern": "US Air Force L. Fletcher Prouty"},
|
|
{"label": "LOC", "pattern": "Bohemian Grove"},
|
|
{"label": "LOC", "pattern": "Broadway"},
|
|
{"label": "NORP", "pattern": "North American"},
|
|
{"label": "NULL", "pattern": "Northern"},
|
|
{"label": "GPE", "pattern": "Allanwood"},
|
|
{"label": "GPE", "pattern": "Westminster"},
|
|
{"label": "GPE", "pattern": "Portland"},
|
|
{"label": "GPE", "pattern": "Richmond"},
|
|
{"label": "GPE", "pattern": "Sacramento"},
|
|
{"label": "GPE", "pattern": "St Louis"},
|
|
{"label": "GPE", "pattern": "New Haven"},
|
|
{"label": "GPE", "pattern": "Milwaukee"},
|
|
{"label": "GPE", "pattern": "Little Rock"},
|
|
{"label": "GPE", "pattern": "Los Angeles"},
|
|
{"label": "GPE", "pattern": "El Paso"},
|
|
{"label": "GPE", "pattern": "Columbia"},
|
|
{"label": "GPE", "pattern": "Butte"},
|
|
{"label": "GPE", "pattern": "State College"},
|
|
{"label": "GPE", "pattern": "Williamsport"},
|
|
{"label": "GPE", "pattern": "Landsdale"},
|
|
{"label": "GPE", "pattern": "Newtown Square"},
|
|
{"label": "GPE", "pattern": "Allentown"},
|
|
{"label": "GPE", "pattern": "New Castle"},
|
|
{"label": "GPE", "pattern": "Beckley"},
|
|
{"label": "GPE", "pattern": "Alton"},
|
|
{"label": "GPE", "pattern": "Stubenville"},
|
|
{"label": "GPE", "pattern": "Buffalo"},
|
|
{"label": "GPE", "pattern": "Belleville"},
|
|
{"label": "GPE", "pattern": "Bloomington"},
|
|
{"label": "GPE", "pattern": "Carbondale"},
|
|
{"label": "GPE", "pattern": "Champaign"},
|
|
{"label": "GPE", "pattern": "Rock Island"},
|
|
{"label": "GPE", "pattern": "Birmingham"},
|
|
{"label": "GPE", "pattern": "Baltimore"},
|
|
{"label": "GPE", "pattern": "Anchorage"},
|
|
{"label": "GPE", "pattern": "Albuquerque"},
|
|
{"label": "GPE", "pattern": "Albany"},
|
|
{"label": "GPE", "pattern": "Washington DC"},
|
|
{"label": "GPE", "pattern": "Washington, D.C."},
|
|
{"label": "GPE", "pattern": "Avon Park"},
|
|
{"label": "GPE", "pattern": "Mill Point"},
|
|
{"label": "GPE", "pattern": "El Reno"},
|
|
{"label": "GPE", "pattern": "Zagreb"},
|
|
{"label": "GPE", "pattern": "ZAGREB"},
|
|
{"label": "LOC", "pattern": "Whiskey Flat"},
|
|
{"label": "LOC", "pattern": "El Salvador air base"},
|
|
{"label": "LOC", "pattern": "Greenville County"},
|
|
{"label": "LOC", "pattern": "Toyland"},
|
|
{"label": "LOC", "pattern": "Owl's Nest"},
|
|
{"label": "LOC", "pattern": "Dachau"},
|
|
{"label": "LOC", "pattern": "Hill Billies"},
|
|
{"label": "LOC", "pattern": "Cave Man's"},
|
|
{"label": "GPE", "pattern": "Nazareth"},
|
|
{"label": "GPE", "pattern": "Latin America"},
|
|
{"label": "GPE", "pattern": "U.S.S.R"},
|
|
{"label": "GPE", "pattern": "New York City"},
|
|
{"label": "GPE", "pattern": "Soviet Union"},
|
|
{"label": "GPE", "pattern": "Viet Nam"},
|
|
{"label": "GPE", "pattern": "U.S.A."},
|
|
{"label": "GPE", "pattern": "U.S."},
|
|
{"label": "GPE", "pattern": "U.S"},
|
|
{"label": "GPE", "pattern": "Saudi Arabia"},
|
|
{"label": "GPE", "pattern": "Washington, DC"},
|
|
{"label": "GPE", "pattern": "the Vatican"},
|
|
{"label": "GPE", "pattern": "British Empire"},
|
|
{"label": "GPE", "pattern": "Pennsylvania"},
|
|
{"label": "ORG", "pattern": "Warren Commission"},
|
|
{"label": "ORG", "pattern": "Hahn group"},
|
|
{"label": "ORG", "pattern": "U.S. Department of Health and Human Services"},
|
|
{"label": "ORG", "pattern": "American Council of Life Insurance"},
|
|
{"label": "ORG", "pattern": "Health Insurance Association of America."},
|
|
{"label": "ORG", "pattern": "U.S. District Court"},
|
|
{"label": "ORG", "pattern": "National Guard"},
|
|
{"label": "ORG", "pattern": "US Army"},
|
|
{"label": "ORG", "pattern": "London Underground"},
|
|
{"label": "ORG", "pattern": "US Justice Department"},
|
|
{"label": "ORG", "pattern": "Sur Coester"},
|
|
{"label": "ORG", "pattern": "SoftQuad Inc."},
|
|
{"label": "ORG", "pattern": "United Nations World Health Organization"},
|
|
{"label": "ORG", "pattern": "L.E.A.A."},
|
|
{"label": "ORG", "pattern": "Justice Dept."},
|
|
{"label": "ORG", "pattern": "Contemporary Research, Inc."},
|
|
{"label": "ORG", "pattern": "British Government"},
|
|
{"label": "ORG", "pattern": "Office of Criminal Justice Planning"},
|
|
{"label": "ORG", "pattern": "Dept. of Defense of the United States"},
|
|
{"label": "ORG", "pattern": "Dept. of Commerce"},
|
|
{"label": "ORG", "pattern": "W.A. Harriman & Co."},
|
|
{"label": "ORG", "pattern": "Securities C. Commission"},
|
|
{"label": "ORG", "pattern": "Dallas Citizens Council"},
|
|
{"label": "ORG", "pattern": "Round Table groups"},
|
|
{"label": "ORG", "pattern": "The U.S. Agriculture Department"},
|
|
{"label": "ORG", "pattern": "Perpetual Hidden Government"},
|
|
{"label": "ORG", "pattern": "Office of the Deputy Chief of Staff of Personnel"},
|
|
{"label": "ORG", "pattern": "Harriman Fifteen Corp."},
|
|
{"label": "ORG", "pattern": "Israeli secret service"},
|
|
{"label": "ORG", "pattern": "Bell Helicopter"},
|
|
{"label": "ORG", "pattern": "Eurocheque"},
|
|
{"label": "ORG", "pattern": "MasterCard"},
|
|
{"label": "ORG", "pattern": "CitiCorp"},
|
|
{"label": "ORG", "pattern": "Taymar, Inc."},
|
|
{"label": "ORG", "pattern": "Kerr McGree Nuclear Corporation"},
|
|
{"label": "ORG", "pattern": "Edward R. Murrow Center"},
|
|
{"label": "ORG", "pattern": "American Ship and Commerce Corp."},
|
|
{"label": "ORG", "pattern": "American government"},
|
|
{"label": "ORG", "pattern": "American Association for the Advancement of Science"},
|
|
{"label": "ORG", "pattern": "G.H. Walker & Co."},
|
|
{"label": "ORG", "pattern": "Office of Preparedness"},
|
|
{"label": "ORG", "pattern": "John Birch Society"},
|
|
{"label": "ORG", "pattern": "U.S. government"},
|
|
{"label": "ORG", "pattern": "Cleveland Field Office"},
|
|
{"label": "ORG", "pattern": "The John F. Kennedy Center for the Performing Arts Education Program"},
|
|
{"label": "ORG", "pattern": "United States government"},
|
|
{"label": "ORG", "pattern": "National Institute of Agrarian Reform"},
|
|
{"label": "ORG", "pattern": "National Institute of Mental Health"},
|
|
{"label": "ORG", "pattern": "INRA"},
|
|
{"label": "ORG", "pattern": "Dept. of the Army"},
|
|
{"label": "ORG", "pattern": "Criminal Justice System"},
|
|
{"label": "ORG", "pattern": "Sheriffs Dept."},
|
|
{"label": "ORG", "pattern": "L.E.A.A. Newsletter"},
|
|
{"label": "ORG", "pattern": "Texas School Book Depository"},
|
|
{"label": "ORG", "pattern": "Senate Select Committee on Intelligence"},
|
|
{"label": "ORG", "pattern": "House Committee on Assassinations"},
|
|
{"label": "ORG", "pattern": "House of Saud"},
|
|
{"label": "ORG", "pattern": "Federal Emergency Management Agency"},
|
|
{"label": "ORG", "pattern": "Special Forces Unit"},
|
|
{"label": "ORG", "pattern": "Defense Central Index"},
|
|
{"label": "ORG", "pattern": "Joint Chiefs of Staff"},
|
|
{"label": "ORG", "pattern": "Federal Information Center"},
|
|
{"label": "ORG", "pattern": "American Counter-intelligence Corps"},
|
|
{"label": "ORG", "pattern": "CIC"},
|
|
{"label": "ORG", "pattern": "British Establishment."},
|
|
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "[A-Z][a-z]+\s+([A-Z]\.\s+[A-Z][a-z]+)?(,\s+Jr\.)"}}]},
|
|
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "[A-Z][a-z]+\s+([A-Z]\.\s+[A-Z][a-z]+)?(\s+Jr\.)"}}]},
|
|
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "[A-Z][a-z]+\s+(([A-Z]\.\s+)?[A-Z][a-z]+([A-Z][a-z]+)?)?\s+Jr\."}}]},
|
|
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "[[A-Z][a-z]+\s+[A-Z]\.\s+([A-Z][a-z])?[A-Z][a-z]+"}}]},
|
|
{"label": "PERSON", "pattern": "Nostradameus"},
|
|
{"label": "PERSON", "pattern": "Harrison Edward Livingstone"},
|
|
{"label": "PERSON", "pattern": "J. Herbert Sawyer"},
|
|
{"label": "PERSON", "pattern": "L. Harmon Zeigler"},
|
|
{"label": "PERSON", "pattern": "C. Everett Koop"},
|
|
{"label": "PERSON", "pattern": "Edward L. van Roden"},
|
|
{"label": "PERSON", "pattern": "Kirschbaum"},
|
|
{"label": "PERSON", "pattern": "Raphael Shumacker"},
|
|
{"label": "PERSON", "pattern": "DAVID MELLOR"},
|
|
{"label": "PERSON", "pattern": "J. W. Willmott"},
|
|
{"label": "PERSON", "pattern": "Morris Ellowitz"},
|
|
{"label": "PERSON", "pattern": "Harry Thon"},
|
|
{"label": "PERSON", "pattern": "Locust"},
|
|
{"label": "PERSON", "pattern": "Nuri Al-Said"},
|
|
{"label": "PERSON", "pattern": "Faisal II"},
|
|
{"label": "PERSON", "pattern": "Abdul Llah"},
|
|
{"label": "PERSON", "pattern": "John Paul I"},
|
|
{"label": "PERSON", "pattern": "John Paul II"},
|
|
{"label": "PERSON", "pattern": "Rockefeller III"},
|
|
{"label": "PERSON", "pattern": "John D. Rockefeller III"},
|
|
{"label": "PERSON", "pattern": "John D. Rockefeller IV"},
|
|
{"label": "PERSON", "pattern": "William H. Draper III"},
|
|
{"label": "PERSON", "pattern": "Eduardo"},
|
|
{"label": "PERSON", "pattern": "G.H. Walker III"},
|
|
{"label": "PERSON", "pattern": "Daniel Gooch"},
|
|
{"label": "PERSON", "pattern": "Ramses II"},
|
|
{"label": "PERSON", "pattern": "Alexander II"},
|
|
{"label": "PERSON", "pattern": "Alexander V"},
|
|
{"label": "PERSON", "pattern": "Alexander IV"},
|
|
{"label": "PERSON", "pattern": "Alexander VI"},
|
|
{"label": "PERSON", "pattern": "CARROLL QUIGLEY"},
|
|
{"label": "PERSON", "pattern": "James I"},
|
|
{"label": "PERSON", "pattern": "Oswald II"},
|
|
{"label": "PERSON", "pattern": "Oswald I"},
|
|
{"label": "PERSON", "pattern": "Umberto I"},
|
|
{"label": "PERSON", "pattern": "C. Hamilton Ellis"},
|
|
{"label": "PERSON", "pattern": "Alfred Ely Beach"},
|
|
{"label": "PERSON", "pattern": "Beach"},
|
|
{"label": "PERSON", "pattern": "Lord Milner"},
|
|
{"label": "PERSON", "pattern": "Agapetus I"},
|
|
{"label": "PERSON", "pattern": "Agapetus II"},
|
|
{"label": "PERSON", "pattern": "C. Victor Raiser II"},
|
|
{"label": "PERSON", "pattern": "C.V. Raiser II"},
|
|
{"label": "PERSON", "pattern": "Boniface VIII"},
|
|
{"label": "PERSON", "pattern": "Boniface VII"},
|
|
{"label": "PERSON", "pattern": "Boniface III"},
|
|
{"label": "PERSON", "pattern": "Boniface II"},
|
|
{"label": "PERSON", "pattern": "Boniface VI"},
|
|
{"label": "PERSON", "pattern": "Gonda"},
|
|
{"label": "PERSON", "pattern": "Richard II"},
|
|
{"label": "PERSON", "pattern": "Richard III"},
|
|
{"label": "PERSON", "pattern": "George Terwilliger III"},
|
|
{"label": "PERSON", "pattern": "C.E. Koop"},
|
|
{"label": "PERSON", "pattern": "Callistus I"},
|
|
{"label": "PERSON", "pattern": "Francis von Hapsburg"},
|
|
{"label": "PERSON", "pattern": "Malcom X"},
|
|
{"label": "PERSON", "pattern": "Victor IV"},
|
|
{"label": "PERSON", "pattern": "Carles C. Messick III"},
|
|
{"label": "PERSON", "pattern": "John D. Rockefeller IV"},
|
|
{"label": "PERSON", "pattern": "MARCONI"},
|
|
{"label": "PERSON", "pattern": "Marconi"},
|
|
{"label": "PERSON", "pattern": "Charles W. Bailey II"},
|
|
{"label": "PERSON", "pattern": "Charles I"},
|
|
{"label": "PERSON", "pattern": "Charles McKee"},
|
|
{"label": "PERSON", "pattern": "Charles X"},
|
|
{"label": "PERSON", "pattern": "Charles V"},
|
|
{"label": "PERSON", "pattern": "Jerome I"},
|
|
{"label": "PERSON", "pattern": "Mark I"},
|
|
{"label": "PERSON", "pattern": "John XXIII"},
|
|
{"label": "PERSON", "pattern": "Ferdinand I"},
|
|
{"label": "PERSON", "pattern": "Nicholas I"},
|
|
{"label": "PERSON", "pattern": "Nicholas II"},
|
|
{"label": "PERSON", "pattern": "Nicholas V"},
|
|
{"label": "PERSON", "pattern": "John Zajac"},
|
|
{"label": "PERSON", "pattern": "Sylvester I"},
|
|
{"label": "PERSON", "pattern": "Sylvester II"},
|
|
{"label": "PERSON", "pattern": "Engenius IV"},
|
|
{"label": "PERSON", "pattern": "Philip IV"},
|
|
{"label": "PERSON", "pattern": "Stephen II"},
|
|
{"label": "PERSON", "pattern": "Stephen VI"},
|
|
{"label": "PERSON", "pattern": "Pious X"},
|
|
{"label": "PERSON", "pattern": "Honorius II"},
|
|
{"label": "PERSON", "pattern": "Sergius III"},
|
|
{"label": "PERSON", "pattern": "Adrian I"},
|
|
{"label": "PERSON", "pattern": "Adrian VI"},
|
|
{"label": "PERSON", "pattern": "William F. Hamilton III"},
|
|
{"label": "PERSON", "pattern": "James Baker III"},
|
|
{"label": "PERSON", "pattern": "William Stamps Farish III"},
|
|
{"label": "PERSON", "pattern": "Elizabeth II"},
|
|
{"label": "PERSON", "pattern": "Anacletus II"},
|
|
{"label": "PERSON", "pattern": "Edward II"},
|
|
{"label": "PERSON", "pattern": "Edward III"},
|
|
{"label": "PERSON", "pattern": "Edward V"},
|
|
{"label": "PERSON", "pattern": "Matthew XVI"},
|
|
{"label": "PERSON", "pattern": "Thothmes III"},
|
|
{"label": "PERSON", "pattern": "Harry Hurt III"},
|
|
{"label": "PERSON", "pattern": "Napoleon III"},
|
|
{"label": "PERSON", "pattern": "Clement VIII"},
|
|
{"label": "PERSON", "pattern": "Clement I"},
|
|
{"label": "PERSON", "pattern": "Clement VI"},
|
|
{"label": "PERSON", "pattern": "Clement IV"},
|
|
{"label": "PERSON", "pattern": "Pius X"},
|
|
{"label": "PERSON", "pattern": "Pius II"},
|
|
{"label": "PERSON", "pattern": "Pius IX"},
|
|
{"label": "PERSON", "pattern": "Pius XI"},
|
|
{"label": "PERSON", "pattern": "Pius XII"},
|
|
{"label": "PERSON", "pattern": "Leo IX"},
|
|
{"label": "PERSON", "pattern": "Leo X"},
|
|
{"label": "PERSON", "pattern": "Leo III"},
|
|
{"label": "PERSON", "pattern": "Leo XIII"},
|
|
{"label": "PERSON", "pattern": "Amenhotep IV"},
|
|
{"label": "PERSON", "pattern": "Charles W. Bailey II"},
|
|
{"label": "PERSON", "pattern": "Constantine"},
|
|
{"label": "PERSON", "pattern": "Tut-ankh-amen"},
|
|
{"label": "PERSON", "pattern": "Sixtus V"},
|
|
{"label": "PERSON", "pattern": "Sixtus IV"},
|
|
{"label": "PERSON", "pattern": "Benedict XIV"},
|
|
{"label": "PERSON", "pattern": "Benedict VI"},
|
|
{"label": "PERSON", "pattern": "Benedict IX"},
|
|
{"label": "PERSON", "pattern": "Benedict XV"},
|
|
{"label": "PERSON", "pattern": "George I"},
|
|
{"label": "PERSON", "pattern": "George III"},
|
|
{"label": "PERSON", "pattern": "Gregory VII"},
|
|
{"label": "PERSON", "pattern": "Gregory I"},
|
|
{"label": "PERSON", "pattern": "Gregory XVI"},
|
|
{"label": "PERSON", "pattern": "Gregory IX"},
|
|
{"label": "PERSON", "pattern": "Constantine II"},
|
|
{"label": "PERSON", "pattern": "Albert I"},
|
|
{"label": "PERSON", "pattern": "Albert V. Bryan Jr"},
|
|
{"label": "PERSON", "pattern": "Alfonso XII"},
|
|
{"label": "PERSON", "pattern": "Alfonso XIII"},
|
|
{"label": "PERSON", "pattern": "Gustavus III"},
|
|
{"label": "PERSON", "pattern": "Gustav III"},
|
|
{"label": "PERSON", "pattern": "Felix III"},
|
|
{"label": "PERSON", "pattern": "Valentinian III"},
|
|
{"label": "PERSON", "pattern": "Innocent III"},
|
|
{"label": "PERSON", "pattern": "Innocent IV"},
|
|
{"label": "PERSON", "pattern": "Frederick, II"},
|
|
{"label": "PERSON", "pattern": "Frederick II"},
|
|
{"label": "PERSON", "pattern": "Theodosius I"},
|
|
{"label": "PERSON", "pattern": "Henry VIII"},
|
|
{"label": "PERSON", "pattern": "Henry VI"},
|
|
{"label": "PERSON", "pattern": "Henry IV"},
|
|
{"label": "PERSON", "pattern": "Louis IX"},
|
|
{"label": "PERSON", "pattern": "Louis XVI"},
|
|
{"label": "PERSON", "pattern": "Joseph II"},
|
|
{"label": "PERSON", "pattern": "Catherine II"},
|
|
{"label": "PERSON", "pattern": "James D Bryant II"},
|
|
{"label": "PERSON", "pattern": "Paul III"},
|
|
{"label": "PERSON", "pattern": "Paul II"},
|
|
{"label": "PERSON", "pattern": "Julius II"},
|
|
{"label": "PERSON", "pattern": "Paul VI"},
|
|
{"label": "PERSON", "pattern": "William II"},
|
|
{"label": "PERSON", "pattern": "William III"},
|
|
{"label": "PERSON", "pattern": "Baldwin II"},
|
|
{"label": "PERSON", "pattern": "Urban VIII"},
|
|
{"label": "PERSON", "pattern": "Charles Goodhue, III"},
|
|
{"label": "PERSON", "pattern": "Edwin Meese, III"},
|
|
{"label": "PERSON", "pattern": "Thomas Downing"},
|
|
{'label': 'PERSON', 'pattern': 'James "Bo" Gritz'},
|
|
{"label": "PERSON", "pattern": "James Earl Ray"},
|
|
{"label": "PERSON", "pattern": "S. Rilling"},
|
|
{"label": "PERSON", "pattern": "Scott Weekly"},
|
|
{"label": "PERSON", "pattern": "Ford, L.H"},
|
|
{"label": "PERSON", "pattern": "A. Vilenkin"},
|
|
{"label": "PERSON", "pattern": "Renate Viebahn"},
|
|
{"label": "PERSON", "pattern": "Lustick, Ian S."},
|
|
{"label": "PERSON", "pattern": "Friedman, Robert I."},
|
|
{"label": "PERSON", "pattern": "Howard Sprague"},
|
|
{"label": "PERSON", "pattern": "Mark Clark"},
|
|
{"label": "PERSON", "pattern": "Owen"},
|
|
{"label": "PERSON", "pattern": "Theodore Shackley"},
|
|
{"label": "PERSON", "pattern": "Clines"},
|
|
{"label": "PERSON", "pattern": "L. F. Prouty"},
|
|
{"label": "PERSON", "pattern": "L. Fletcher Prouty"},
|
|
{"label": "PERSON", "pattern": "Knight, Amy W."},
|
|
{"label": "PERSON", "pattern": "Martindale"},
|
|
{"label": "PERSON", "pattern": "R. Knox"},
|
|
{"label": "PERSON", "pattern": "Leon Oswald"},
|
|
{"label": "PERSON", "pattern": "H. Ross Perot"},
|
|
{"label": "PERSON", "pattern": "Stanly R. Larsen"},
|
|
{"label": "PERSON", "pattern": "L.E. Allen Jr."},
|
|
{"label": "PERSON", "pattern": "A. Lewis"},
|
|
{"label": "PERSON", "pattern": "D. Fleming"},
|
|
{"label": "PERSON", "pattern": "J. Edgar Hoover"},
|
|
{"label": "PERSON", "pattern": "V. Pirie"},
|
|
{"label": "PERSON", "pattern": "Nancy B."},
|
|
{"label": "PERSON", "pattern": "Ronald Reagan"},
|
|
{"label": "PERSON", "pattern": "P. Gibbs"},
|
|
{"label": "PERSON", "pattern": "W. Citrine"},
|
|
{"label": "PERSON", "pattern": "E. Moore"},
|
|
{"label": "PERSON", "pattern": "Lyndon Johnson"},
|
|
{"label": "PERSON", "pattern": "John Kennedy"},
|
|
{"label": "PERSON", "pattern": "Zia ul-Haque"},
|
|
{"label": "PERSON", "pattern": "Ronald Payne"},
|
|
{"label": "PERSON", "pattern": "R. Muldoon"},
|
|
{"label": "PERSON", "pattern": "F. Orr"},
|
|
{"label": "PERSON", "pattern": "Frank H. Schwable"},
|
|
{"label": "PERSON", "pattern": "R. F. Doyle"},
|
|
{"label": "PERSON", "pattern": "David Munson"},
|
|
{"label": "PERSON", "pattern": "Frances Clark"},
|
|
{"label": "PERSON", "pattern": "G. Gordon Broadbent"},
|
|
{"label": "PERSON", "pattern": "J. Segal"},
|
|
{"label": "PERSON", "pattern": "F. Brangwyn"},
|
|
{"label": "PERSON", "pattern": "A. Luchaire"},
|
|
{"label": "PERSON", "pattern": "L. Segal"},
|
|
{"label": "PERSON", "pattern": "R. William Davis"},
|
|
{"label": "PERSON", "pattern": "William W. Quinn"},
|
|
{"label": "PERSON", "pattern": "William R. Pabst"},
|
|
{"label": "PERSON", "pattern": "William Pabst"},
|
|
{"label": "PERSON", "pattern": "G. Maeferren"},
|
|
{"label": "PERSON", "pattern": "Dian-Lanz"},
|
|
{"label": "PERSON", "pattern": "Buria"},
|
|
{"label": "PERSON", "pattern": "North"},
|
|
{"label": "PERSON", "pattern": "Ortho III"},
|
|
{"label": "PERSON", "pattern": "Santo Trafficante"},
|
|
{"label": "PERSON", "pattern": "Felix Rodreguez"},
|
|
{"label": "PERSON", "pattern": "Stanly F. Yoles"},
|
|
{"label": "PERSON", "pattern": "G. B. Chisholm"},
|
|
{"label": "PERSON", "pattern": "Baron Kurt von Schroeder"},
|
|
{"label": "PERSON", "pattern": "George Bush"},
|
|
{"label": "PERSON", "pattern": "Roland Harriman"},
|
|
{"label": "PERSON", "pattern": "John Foster Dulles"},
|
|
{"label": "PERSON", "pattern": "Karen Silkwood"},
|
|
{"label": "PERSON", "pattern": "N. McQuire"},
|
|
{"label": "PERSON", "pattern": "William Highland"},
|
|
{"label": "PERSON", "pattern": "Harriman"},
|
|
{"label": "PERSON", "pattern": "William Saxby"},
|
|
{"label": "PERSON", "pattern": "Ellis O. Briggs"},
|
|
{"label": "PERSON", "pattern": "Daniel Ludwig"},
|
|
{"label": "PERSON", "pattern": "Robert C. Klowers"},
|
|
{"label": "PERSON", "pattern": "Tatum B. Laird"},
|
|
{"label": "PERSON", "pattern": "William F. Buckley, Jr."},
|
|
{"label": "PERSON", "pattern": "William Randolph Hearst, Jr."},
|
|
{"label": "PERSON", "pattern": "Justin Dart"},
|
|
{"label": "PERSON", "pattern": "Larry McDonald"},
|
|
{"label": "PERSON", "pattern": "Lievense"},
|
|
{"label": "PERSON", "pattern": "CASTRO"},
|
|
{"label": "PERSON", "pattern": "Erik Jonsson"},
|
|
{"label": "PERSON", "pattern": "Charles E. Allen"},
|
|
{"label": "PERSON", "pattern": "Laurence H. Shoup"},
|
|
{"label": "PERSON", "pattern": "David Rieff"},
|
|
{"label": "PERSON", "pattern": "Diamond"},
|
|
{"label": "PERSON", "pattern": "Donald DeFreeze"},
|
|
{"label": "PERSON", "pattern": "McGovern"},
|
|
{"label": "PERSON", "pattern": "McCarthy"},
|
|
{"label": "PERSON", "pattern": "John Connally"},
|
|
{"label": "PERSON", "pattern": "Lyndon LaRouche"},
|
|
{"label": "PERSON", "pattern": "LaRouche"},
|
|
{"label": "PERSON", "pattern": "W.H. Bowart"},
|
|
{"label": "PERSON", "pattern": "Lehrman"},
|
|
{"label": "PERSON", "pattern": "John McCone"},
|
|
{"label": "PERSON", "pattern": "Al Haig"},
|
|
{"label": "PERSON", "pattern": "J. Latimer Clark"},
|
|
{"label": "PERSON", "pattern": "T. W. Rammell"},
|
|
{"label": "PERSON", "pattern": "Elizabeth II"},
|
|
{"label": "PERSON", "pattern": "Martha Honey"},
|
|
{"label": "PERSON", "pattern": "Vince Bielski"},
|
|
{"label": "PERSON", "pattern": "Anton Chaitkin"},
|
|
{"label": "PERSON", "pattern": "Dean Burch"},
|
|
{"label": "PERSON", "pattern": "Burch"},
|
|
{"label": "PERSON", "pattern": "Gaius Caesar"},
|
|
{"label": "PERSON", "pattern": "Gaius Chaerea"},
|
|
{"label": "PERSON", "pattern": "McCone"},
|
|
{"label": "PERSON", "pattern": "Richard Nixon"},
|
|
{"label": "PERSON", "pattern": "David Wise"},
|
|
{"label": "PERSON", "pattern": "Zapruder"},
|
|
{"label": "PERSON", "pattern": "Mortimer J. Adler"},
|
|
{"label": "PERSON", "pattern": "Angela Davis"},
|
|
{"label": "PERSON", "pattern": "Segals"},
|
|
{"label": "PERSON", "pattern": "COLIN WALLACE"},
|
|
{"label": "PERSON", "pattern": "Thomas B. Ross"},
|
|
{"label": "PERSON", "pattern": "Bowart"},
|
|
{"label": "PERSON", "pattern": "Frank H. Schwable"},
|
|
{"label": "PERSON", "pattern": "John F. Kennedy"},
|
|
{"label": "PERSON", "pattern": "R. J. Biggar"},
|
|
{"label": "PERSON", "pattern": "David R. Hunter"},
|
|
{"label": "PERSON", "pattern": "Hugh Everett, III"},
|
|
{"label": "PERSON", "pattern": "Sigmund Diamond"},
|
|
{"label": "PERSON", "pattern": "Montagnier"},
|
|
{"label": "PERSON", "pattern": "W. Schmunger"},
|
|
{"label": "PERSON", "pattern": "Vince Bielski"},
|
|
{"label": "PERSON", "pattern": "Lee Harvey Oswald"},
|
|
{"label": "PERSON", "pattern": "Schmunger"},
|
|
{"label": "PERSON", "pattern": "Stranglove"},
|
|
{"label": "PERSON", "pattern": "James Calcutt"},
|
|
{"label": "PERSON", "pattern": "Florentine Giovanni Boccaccio"},
|
|
{"label": "PERSON", "pattern": "Prescott Bush"},
|
|
{"label": "PERSON", "pattern": "Maxwell"},
|
|
{"label": "PERSON", "pattern": "Daniel Schorr"},
|
|
{"label": "PERSON", "pattern": "Allard K. Lowenstein"},
|
|
{"label": "PERSON", "pattern": "Daniel Patrick Moynihan"},
|
|
{"label": "ORG", "pattern": "National Institute of Health"},
|
|
{"label": "ORG", "pattern": "Department of Justice"},
|
|
{"label": "ORG", "pattern": "Yakuza"},
|
|
{"label": "ORG", "pattern": "YAKUZA"},
|
|
{"label": "ORG", "pattern": "U.S. Department of Justice"},
|
|
{"label": "ORG", "pattern": "FBI Field Office"},
|
|
{"label": "ORG", "pattern": "Bureau of Investigation"},
|
|
{"label": "ORG", "pattern": "Dallas Citizens Council"},
|
|
{"label": "ORG", "pattern": "University of Maryland,"},
|
|
{"label": "ORG", "pattern": "Carnegie Corporation"},
|
|
{"label": "ORG", "pattern": "Katzenbach Committee"},
|
|
{"label": "ORG", "pattern": "Democratic Congress"},
|
|
{"label": "ORG", "pattern": "National Endowment for Democracy"},
|
|
{"label": "ORG", "pattern": "Congress for Cultural Freedom"},
|
|
{"label": "ORG", "pattern": "Cambridge University Press"},
|
|
{"label": "ORG", "pattern": "Association of National Security Alumni"},
|
|
{"label": "ORG", "pattern": "Avon Books"},
|
|
{"label": "ORG", "pattern": "Ministry of Defence"},
|
|
{"label": "ORG", "pattern": "Ramparts Press"},
|
|
{"label": "ORG", "pattern": "Grove Press"},
|
|
{"label": "ORG", "pattern": "National Association of Scholars"},
|
|
{"label": "ORG", "pattern": "Council on Foreign Relations"},
|
|
{"label": "ORG", "pattern": "Symbionese Liberation Army"},
|
|
{"label": "ORG", "pattern": "Washington Times"},
|
|
{"label": "ORG", "pattern": "Center for Strategic and International Studies"},
|
|
{"label": "ORG", "pattern": "School of Foreign Service"},
|
|
{"label": "ORG", "pattern": "The Anglo-American Establishment"},
|
|
{"label": "ORG", "pattern": "Centre of Eternity"},
|
|
{"label": "ORG", "pattern": "Ruling Elite"},
|
|
{"label": "ORG", "pattern": "US Congress"},
|
|
{"label": "ORG", "pattern": "National Academy of Sciences"},
|
|
{"label": "ORG", "pattern": "Ramparts"},
|
|
{"label": "NULL", "pattern": "Bab"},
|
|
{"label": "LOC", "pattern": "Kingdom of God"},
|
|
{"label": "LOC", "pattern": "Frankfurt airport"},
|
|
{"label": "LOC", "pattern": "Dealey Plaza"},
|
|
{"label": "LOC", "pattern": "Central Africa"},
|
|
{"label": "LOC", "pattern": "Fort Detrick"},
|
|
{"label": "LOC", "pattern": "Clifton suspension bridge"},
|
|
{"label": "NULL", "pattern": "mandkind"},
|
|
{"label": "MISC", "pattern": "AIDS virus"},
|
|
{"label": "MISC", "pattern": "AIDS"},
|
|
{"label": "MISC", "pattern": "AIDS-1"},
|
|
{"label": "MISC", "pattern": "AIDS VIRUS"},
|
|
{"label": "MISC", "pattern": "Human Adult Leukemia virus"},
|
|
{"label": "MISC", "pattern": "LAV"},
|
|
{"label": "MISC", "pattern": "ARC"},
|
|
{"label": "MISC", "pattern": "HTLV-1"},
|
|
{"label": "MISC", "pattern": "HTLV-III"},
|
|
{"label": "MISC", "pattern": "HTLV"},
|
|
{"label": "MISC", "pattern": "HIV"},
|
|
{"label": "MISC", "pattern": "HIV virus"},
|
|
{"label": "MISC", "pattern": "Sheep Visna Virus"},
|
|
{"label": "MISC", "pattern": "Pneumocystis carinii pneumonia"},
|
|
{"label": "MISC", "pattern": "Kaposi's sarcoma"},
|
|
{"label": "MISC", "pattern": "Smallpox"},
|
|
{"label": "MISC", "pattern": "Polio"},
|
|
{"label": "MISC", "pattern": "LSD"},
|
|
{"label": "MISC", "pattern": "Poison"},
|
|
{"label": "MISC", "pattern": "Acquired Immuno-Deficiency Syndrome"},
|
|
{"label": "MISC", "pattern": "Equal Opportunity Employer"},
|
|
{"label": "MISC", "pattern": "Visna"},
|
|
{"label": "MISC", "pattern": "visna"},
|
|
{"label": "MISC", "pattern": "visna virus"},
|
|
{"label": "EVENT", "pattern": "Korean War"},
|
|
{"label": "EVENT", "pattern": "Railway Mania"},
|
|
{"label": "EVENT", "pattern": "Vietnam War"},
|
|
{"label": "EVENT", "pattern": "Apollo 11"},
|
|
{"label": "EVENT", "pattern": "Johnson's war on poverty"},
|
|
{"label": "MISC", "pattern": "AIDS"},
|
|
{"label": "NORP", "pattern": "Chinesse"},
|
|
{"label": "NORP", "pattern": "Vietnamese"},
|
|
{"label": "NORP", "pattern": "multiculturalists"},
|
|
{"label": "NORP", "pattern": "Vietnamese"},
|
|
{"label": "NORP", "pattern": "Anglo-Saxon"},
|
|
]
|
|
# Hand the custom label/pattern list to the span ruler that was added to the
# spaCy pipeline, so these rules are applied whenever nlp() processes a text.
ruler.add_patterns(patterns)

# Input and output folders are resolved relative to the directory the script
# is launched from.
workingDir = os.getcwd()
CollPath = os.path.join(workingDir, '../regexConspTest')
outputPath = os.path.join(workingDir, 'personTestingOutput/')

# Names of everything in the original conspiracy directory; printed so the
# run log shows which files are about to be processed.
insideDir = os.listdir(CollPath)
print(insideDir)
|
|
|
|
# Copies files in case they do not exist
|
|
def copyTextFiles(file):
    """Copy one file from the collection folder into the output folder.

    The file is read and written as UTF-8 text. An existing file of the same
    name in the output folder is overwritten, so every run starts from a
    fresh, untagged copy.

    Args:
        file: Name of a file inside CollPath (no directory part).
    """
    sourceFile = os.path.join(CollPath, file)
    targetFile = os.path.join(outputPath, file)

    # The output folder may not exist yet on a fresh checkout; without this
    # the open(..., 'w') below would fail with FileNotFoundError.
    os.makedirs(outputPath, exist_ok=True)

    # Read the whole file; the with-statement closes it, so no explicit
    # close() call is needed.
    with open(sourceFile, 'r', encoding='utf8') as inFile:
        content = inFile.readlines()
    print(" ~~~~~~~~~~~~~~~~~~~~~~~~~~~ copying " + file + " ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ")

    # Write the collected lines into the copy in the output folder.
    with open(targetFile, 'w', encoding='utf8') as f:
        f.writelines(content)
|
|
|
|
# Function runs through the entities spaCy found in the given doc. Each entity is logged to conspPERSON.txt and stored in a dictionary (entity text -> label), which is returned. Called by regexFile().
|
|
def entitycollector(tokens):
    """Collect the named entities of a processed text and log them to a file.

    Each entity is written to 'conspPERSON.txt' in the current working
    directory as one "[text, label]" line. The file is opened in write mode,
    so it is replaced on every call.

    Args:
        tokens: Object whose ``ents`` attribute is an iterable of sortable
            entities exposing ``text`` and ``label_`` (regexFile() passes the
            result of nlp()).

    Returns:
        dict mapping entity text to its label. When the same text occurs more
        than once, the label of the last occurrence in sorted order wins.
    """
    entities = {}
    # Explicit UTF-8, like every other file this script opens, so the log does
    # not depend on the platform's default encoding.
    with open('conspPERSON.txt', 'w', encoding='utf8') as f:
        for ent in sorted(tokens.ents):
            f.write(str([ent.text, ent.label_]) + '\n')
            entities[ent.text] = ent.label_
    return entities
|
|
|
|
# Function runs regex through given file.
|
|
def regexFile(file):
    """Tag the named entities spaCy finds in one copied XML file.

    The file is read from outputPath, the text of its <p> elements is joined
    into a single string, reduced to letters and spaces, and passed to nlp().
    Every entity labelled PERSON, LOC, ORG, GPE, NORP or EVENT is wrapped in
    <ent type='LABEL'>...</ent> wherever its text occurs in the raw XML, the
    file is overwritten with the result, and checkTags() is run on it.

    Args:
        file: Name of a file inside outputPath (already copied there).
    """
    wrapLabels = {"PERSON", "LOC", "ORG", "GPE", "NORP", "EVENT"}
    fileDir = os.path.join(outputPath, file)

    # Keep the raw XML text: the wrapping below is plain string replacement
    # on it. The with-statement makes sure the handle is closed again.
    with open(fileDir, encoding='utf-8') as inFile:
        xml = inFile.read()

    with PySaxonProcessor(license=False) as proc:
        xp = proc.new_xpath_processor()
        node = proc.parse_xml(xml_text=xml)
        xp.set_context(xdm_item=node)
        # The XPath collects the whitespace-normalized text of every <p> and
        # joins it all into a single string.
        xpath = xp.evaluate('//p ! normalize-space() => string-join()')
        string = str(xpath)

    # Replace every run of characters that is not an ASCII letter or a space
    # with one space. The class is spelled A-Za-z on purpose: the range A-z
    # also spans [ \ ] ^ _ and `, which would let that punctuation through.
    # Newlines are not letters either, so they are covered by this pattern.
    cleanedText = regex.sub(r'[^A-Za-z ]+', ' ', string)

    # Run spaCy on the clean text and collect {entity text: label}.
    tokens = nlp(cleanedText)
    dictEntities = entitycollector(tokens)

    wrappedText = xml
    for entity, label in dictEntities.items():
        if label not in wrapLabels:
            continue
        # key_template is the element wrapped around each found instance.
        key_template = "<ent type='" + label + "'>" + entity + "</ent>"
        # NOTE: str.replace also hits the entity text inside longer words,
        # inside tag names and inside wrappers added earlier in this loop;
        # checkTags() below repairs the known cases of that.
        wrappedText = wrappedText.replace(entity, key_template)
        print("WRAPPING " + entity)

    # Write the result once, after all entities are wrapped, instead of
    # rewriting the file for every single entity.
    if wrappedText != xml:
        with open(fileDir, 'w', encoding='utf8') as f:
            f.write(wrappedText)

    checkTags(file)
|
|
# ebb: Added above line to send the tagged file to the checkTags() function for cleaning.
|
|
|
|
# This part of the code is a WIP.
|
|
# ebb: I just activated it, and it works! (Nice job.) I altered it just a bit. May need more regexes to match.
|
|
## It tries to find weird or invalid elements/tags and fix them.
|
|
def checkTags(file):
    """Repair markup that the entity wrapping in regexFile() left broken.

    The file is read from outputPath, every line is passed through a fixed
    series of regex substitutions, and the file is rewritten in place. Lines
    that no pattern matches are written back unchanged, so nothing is lost.
    Each repaired line is printed before and after the change.

    Repairs, in the order they are applied:
      1. <spe<ent type='ORG'>cia</ent>l>  ->  <special>  (opening and closing tag)
      2. an <ent> nested directly inside another <ent>: only the outermost
         wrapper is kept, at any nesting depth
      3. <name type='X'>abc</name>def  ->  abcdef  (a tag around the start of
         a longer word)
      4. <ent type='<ent type='ORG'>ORG</ent>'>  ->  <ent type='ORG'>

    The similar <<ent type='ORG'>di</ent>v> breakage is known but is not
    repaired here.

    Args:
        file: Name of a file inside outputPath.
    """
    fileDir = os.path.join(outputPath, file)

    # Compile once, outside the per-line loop.
    brokenSpecial = regex.compile(r"(</?spe)<ent type='ORG'>(cia)</ent>(l>)")
    # The label class is A-Za-z_ rather than A-z: the range A-z would also
    # accept [ \ ] ^ and ` inside the type attribute.
    nestedEnt = regex.compile(
        r"(<ent type='[A-Za-z_]+?'>[^<]*?)<ent type='[A-Za-z_]+?'>([^<]+?)</ent>([^<]*?</ent>)"
    )
    partialName = regex.compile(r"<name type='\w+?'>(\w+?)</name>(\w+)")
    brokenAttr = regex.compile(r"(<ent type=')<ent type='ORG'>(ORG)</ent>('>)")

    with open(fileDir, 'r', encoding='utf8') as inFile:
        content = inFile.readlines()

    with open(fileDir, 'w', encoding='utf8') as f:
        for origLine in content:
            newLine = brokenSpecial.sub(r"\1\2\3", origLine)

            # One pass only unwraps the innermost level of nesting, so repeat
            # until the line stops changing. Every substitution removes a
            # tag, so the loop always terminates.
            previous = None
            while previous != newLine:
                previous = newLine
                newLine = nestedEnt.sub(r"\1\2\3", newLine)

            newLine = partialName.sub(r"\1\2", newLine)
            newLine = brokenAttr.sub(r"\1\2\3", newLine)

            if origLine != newLine:
                print("broken line found, fixing...")
                print(origLine + "\n INTO.")
                print(newLine)
            f.write(newLine)
    print("File checking finished.")
|
|
|
|
# Process every file of the collection: make a fresh copy in the output
# folder, then tag the entities in that copy.
for file in insideDir:
    copyTextFiles(file)
    regexFile(file)
    # ebb: checkTags(file) is deliberately NOT called here. regexFile()
    # already sends the tagged file to checkTags(); activating it here
    # would run it over the untagged input files.
|