fresh python-tagged ents from personTagger.py

This commit is contained in:
ebeshero 2023-04-28 05:35:52 -04:00
parent 22a4906740
commit 0d68297335
331 changed files with 37694 additions and 34577 deletions

File diff suppressed because it is too large Load diff

View file

@@ -29,13 +29,14 @@ ruler = nlp.add_pipe("span_ruler", before="ner", config=config)
patterns = [
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^-\w+?"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.$"}}]},
# ebb: Don't match on any single characters!
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "\^+"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^\w\w$"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^[a-z]+\s+[a-z]+$"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.*?__{2,}.*?$"}}]},
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "CHRISTIAN(ITY|DOM)"}}]},
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "CHRISTIAN(ITY|DOM)?"}}]},
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "CHRISTIAN\s+NETWORK"}}]},
# ebb: Don't match on any single characters!
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "Catholic(ism)?"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "[A-Z]{2,}[A-Z][a-z]+"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "[a-z]{2,}[A-Z][a-z]+"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.*?[a-z][A-Z].*?$"}}]},
@@ -771,7 +772,7 @@ patterns = [
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Lop Nor"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Gross Wannsee"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Groom Lake"}}]},
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "North\s+?American"}}]},
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "North\s+?[A-Z][a-z]+"}}]},
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "Monk"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "Northern"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Allanwood"}}]},
@@ -790,12 +791,12 @@ patterns = [
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Columbia"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "FORT\s+?HUNT"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Butte"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "State\s+?College"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "State\s*College"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Williamsport"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Landsdale"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Newtown\s+?Square"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Newtown\s*Square"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Allentown"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "New\s+?Castle"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "New\s*Castle"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Beckley"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Alton"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Stubenville"}}]},
@@ -819,25 +820,24 @@ patterns = [
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Albuquerque"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Albany"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "CORAL\s+?GABLES"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Washington\s+?DC"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Washington,\s+?DC"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Washington,\s+?D\.C\."}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Washington,?\s*D\.?C\.?"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Avon\s+?Park"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Mill\s+?Point"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "El\s+?Reno"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Zagreb"}}]},
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "ZAGREB"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Whiskey\s+?Flat"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Whiskey\s*Flat"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "site\s+?S-4"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "LUNA"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "AREA\s+?51"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "El\s+?Salvador\s+?air\s+?base"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Greenville\s+?County"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Toyland"}}]},
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "Sunday"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Sun"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "North\s+?Pole"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Qua\s+?Vieaf"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "North\s+?Viet\s+?Nam"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "North\s+?Viet\s*[Nn]am"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Owl's\s+?Nest"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Dachau"}}]},
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Hill\s+?Billies"}}]},
@@ -1043,8 +1043,8 @@ patterns = [
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "KRLL"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "KRLLL"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "EBE"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Rockefeller\s+?III"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+?D\.\s+?Rockefeller\s+?III"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Rockefeller\s*III"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s*?D\.\s*Rockefeller\s+?III"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Dennis\s+?DeConcini"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Phil\s+?Gramm"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "BILL\s+?HAMILTON"}}]},
@@ -1054,13 +1054,13 @@ patterns = [
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Rockefeller"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Johnston"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Dodd"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Laurence\s+Rockefeller"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s+Rockefeller"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s+Rockefeller\s+IV"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s+Rockefeller,\s+Jr\."}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "William\s+?H\.\s+?Draper\s+?III"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Laurence\s*Rockefeller"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s*Rockefeller"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s*Rockefeller\s*IV"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s*Rockefeller,\s*Jr\."}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "William\s*H\.\s*Draper\s*III"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Eduardo"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "EARL\s+?W\.\s+?BRIAN"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "EARL\s*W\.\s+?BRIAN"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "RICONOSCIUTO"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Ramakrishna"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Don\s+?Ecker"}}]},
@@ -1099,6 +1099,8 @@ patterns = [
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Boniface\s+?VI"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Gonda"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Ollie\s+?North"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Oliver\s*North"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Mr\.\s*North"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "E\.\s+?Howard\s+?Hunt"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "MLK"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "RFK"}}]},
@@ -1429,7 +1431,7 @@ patterns = [
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?National\s+?Association\s+?of\s+?Scholars"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Council\s+?on\s+?Foreign\s+?Relations"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Symbionese\s+?Liberation\s+?Army"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Washington\s+?Times"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Washington\s+?Post"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Internal\s+?Revenue\s+?Service"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Terminate\s+?With\s+?Extreme\s+?Prejudice"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Center\s+?for\s+?Strategic\s+?and\s+?International\s+?Studies"}}]},
@@ -1438,6 +1440,7 @@ patterns = [
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Centre\s+?of\s+?Eternity"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Life\s+?magazine"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Full\s+?Disclosure\s+?Newspaper"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "North\s+American\s+Newspaper\s+Alliance"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Trilateral\s+?Commision"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Ruling\s+?Elite"}}]},
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?US\s+?Congress"}}]},
@@ -1622,6 +1625,13 @@ def checkTags(file):
# newLine = regex.sub(r"(<ent type='.+?'>[^<>]*?)<ent[^>]+?>([^<>]+?)</ent>([^<>]*?</ent>)", r"\1\2\3",line)
# <spe<ent type='ORG'>cia</ent>l>
newLine = regex.sub(r"(</?spe)<ent type='ORG'>(cia)</ent>(l>)", r"\1\2\3", origLine)
newLine = regex.sub(r"([^>])(New\s+?York\s+?Times)([^<])", r"\1<ent type='ORG'>\2</ent>\3", newLine)
newLine = regex.sub(r"([^>])(British)([^<])", r"\1<ent type='NORP'>\2</ent>\3", newLine)
newLine = regex.sub(r"([^>])(New\s+?York(\s+?City)?)([^<])", r"\1<ent type='GPE'>\2</ent>\4", newLine)
newLine = regex.sub(r"<ent type='[A-z]+?'>(Sundays?)</ent>", r"\1", newLine)
newLine = regex.sub(r"<ent type='PERSON'>(North)</ent>([a-z]*\s+[A-Z][a-z]+)*", r"<ent type='LOC'>\1\2</ent>", newLine)
newLine = regex.sub(r"(North\s+o?f?\s*([A-Z][a-z]+\s+)+)", r"<ent type='LOC'>\1</ent>", newLine)
# newLine = regex.sub(r"(<)<ent type='ORG'>(di)</ent>(v>)", r"\1\2\3", newLine)
newLine = regex.sub(r"(<ent type='[A-z]+?'>[^<]*?)<ent type='[A-z]+?'>([^<]+?)</ent>([^<]*?</ent>)", r"\1\2\3", newLine)
newLine = regex.sub(r"(<ent type='[A-z]+?'>[^<]*?)<ent type='[A-z]+?'>([^<]+?)</ent>([^<]*?</ent>)", r"\1\2\3", newLine)