mirror of
https://github.com/nhammer514/textfiles-politics.git
synced 2025-05-06 16:45:03 -04:00
fresh python-tagged ents from personTagger.py
This commit is contained in:
parent
22a4906740
commit
0d68297335
331 changed files with 37694 additions and 34577 deletions
File diff suppressed because it is too large
Load diff
|
@ -29,13 +29,14 @@ ruler = nlp.add_pipe("span_ruler", before="ner", config=config)
|
|||
patterns = [
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^-\w+?"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.$"}}]},
|
||||
# ebb: Don't match on any single characters!
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "\^+"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^\w\w$"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^[a-z]+\s+[a-z]+$"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.*?__{2,}.*?$"}}]},
|
||||
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "CHRISTIAN(ITY|DOM)"}}]},
|
||||
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "CHRISTIAN(ITY|DOM)?"}}]},
|
||||
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "CHRISTIAN\s+NETWORK"}}]},
|
||||
# ebb: Don't match on any single characters!
|
||||
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "Catholic(ism)?"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "[A-Z]{2,}[A-Z][a-z]+"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "[a-z]{2,}[A-Z][a-z]+"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "^.*?[a-z][A-Z].*?$"}}]},
|
||||
|
@ -771,7 +772,7 @@ patterns = [
|
|||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Lop Nor"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Gross Wannsee"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Groom Lake"}}]},
|
||||
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "North\s+?American"}}]},
|
||||
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "North\s+?[A-Z][a-z]+"}}]},
|
||||
{"label": "NORP", "pattern": [{"TEXT": {"REGEX": "Monk"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "Northern"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Allanwood"}}]},
|
||||
|
@ -790,12 +791,12 @@ patterns = [
|
|||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Columbia"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "FORT\s+?HUNT"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Butte"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "State\s+?College"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "State\s*College"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Williamsport"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Landsdale"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Newtown\s+?Square"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Newtown\s*Square"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Allentown"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "New\s+?Castle"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "New\s*Castle"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Beckley"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Alton"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Stubenville"}}]},
|
||||
|
@ -819,25 +820,24 @@ patterns = [
|
|||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Albuquerque"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Albany"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "CORAL\s+?GABLES"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Washington\s+?DC"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Washington,\s+?DC"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Washington,\s+?D\.C\."}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Washington,?\s*D\.?C\.?"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Avon\s+?Park"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Mill\s+?Point"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "El\s+?Reno"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "Zagreb"}}]},
|
||||
{"label": "GPE", "pattern": [{"TEXT": {"REGEX": "ZAGREB"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Whiskey\s+?Flat"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Whiskey\s*Flat"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "site\s+?S-4"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "LUNA"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "AREA\s+?51"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "El\s+?Salvador\s+?air\s+?base"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Greenville\s+?County"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Toyland"}}]},
|
||||
{"label": "NULL", "pattern": [{"TEXT": {"REGEX": "Sunday"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Sun"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "North\s+?Pole"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Qua\s+?Vieaf"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "North\s+?Viet\s+?Nam"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "North\s+?Viet\s*[Nn]am"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Owl's\s+?Nest"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Dachau"}}]},
|
||||
{"label": "LOC", "pattern": [{"TEXT": {"REGEX": "Hill\s+?Billies"}}]},
|
||||
|
@ -1043,8 +1043,8 @@ patterns = [
|
|||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "KRLL"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "KRLLL"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "EBE"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Rockefeller\s+?III"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+?D\.\s+?Rockefeller\s+?III"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Rockefeller\s*III"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s*?D\.\s*Rockefeller\s+?III"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Dennis\s+?DeConcini"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Phil\s+?Gramm"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "BILL\s+?HAMILTON"}}]},
|
||||
|
@ -1054,13 +1054,13 @@ patterns = [
|
|||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Rockefeller"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Johnston"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Dodd"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Laurence\s+Rockefeller"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s+Rockefeller"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s+Rockefeller\s+IV"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s+Rockefeller,\s+Jr\."}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "William\s+?H\.\s+?Draper\s+?III"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Laurence\s*Rockefeller"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s*Rockefeller"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s*Rockefeller\s*IV"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+D\.\s*Rockefeller,\s*Jr\."}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "William\s*H\.\s*Draper\s*III"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Eduardo"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "EARL\s+?W\.\s+?BRIAN"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "EARL\s*W\.\s+?BRIAN"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "RICONOSCIUTO"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Ramakrishna"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Don\s+?Ecker"}}]},
|
||||
|
@ -1099,6 +1099,8 @@ patterns = [
|
|||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Boniface\s+?VI"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Gonda"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Ollie\s+?North"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Oliver\s*North"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Mr\.\s*North"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "E\.\s+?Howard\s+?Hunt"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "MLK"}}]},
|
||||
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "RFK"}}]},
|
||||
|
@ -1429,7 +1431,7 @@ patterns = [
|
|||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?National\s+?Association\s+?of\s+?Scholars"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Council\s+?on\s+?Foreign\s+?Relations"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Symbionese\s+?Liberation\s+?Army"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Washington\s+?Times"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Washington\s+?Post"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Internal\s+?Revenue\s+?Service"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Terminate\s+?With\s+?Extreme\s+?Prejudice"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Center\s+?for\s+?Strategic\s+?and\s+?International\s+?Studies"}}]},
|
||||
|
@ -1438,6 +1440,7 @@ patterns = [
|
|||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Centre\s+?of\s+?Eternity"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Life\s+?magazine"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Full\s+?Disclosure\s+?Newspaper"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "North\s+American\s+Newspaper\s+Alliance"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Trilateral\s+?Commision"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?Ruling\s+?Elite"}}]},
|
||||
{"label": "ORG", "pattern": [{"TEXT": {"REGEX": "([Tt]he\s+?)?US\s+?Congress"}}]},
|
||||
|
@ -1622,6 +1625,13 @@ def checkTags(file):
|
|||
# newLine = regex.sub(r"(<ent type='.+?'>[^<>]*?)<ent[^>]+?>([^<>]+?)</ent>([^<>]*?</ent>)", r"\1\2\3",line)
|
||||
# Undo a false ORG hit on "cia" inside a "special" tag name, e.g. <spe<ent type='ORG'>cia</ent>l> becomes <special>
|
||||
newLine = regex.sub(r"(</?spe)<ent type='ORG'>(cia)</ent>(l>)", r"\1\2\3", origLine)
|
||||
newLine = regex.sub(r"([^>])(New\s+?York\s+?Times)([^<])", r"\1<ent type='ORG'>\2</ent>\3", newLine)
|
||||
newLine = regex.sub(r"([^>])(British)([^<])", r"\1<ent type='NORP'>\2</ent>\3", newLine)
|
||||
newLine = regex.sub(r"([^>])(New\s+?York(\s+?City)?)([^<])", r"\1<ent type='GPE'>\2</ent>\4", newLine)
|
||||
newLine = regex.sub(r"<ent type='[A-z]+?'>(Sundays?)</ent>", r"\1", newLine)
|
||||
newLine = regex.sub(r"<ent type='PERSON'>(North)</ent>([a-z]*\s+[A-Z][a-z]+)*", r"<ent type='LOC'>\1\2</ent>", newLine)
|
||||
newLine = regex.sub(r"(North\s+o?f?\s*([A-Z][a-z]+\s+)+)", r"<ent type='LOC'>\1</ent>", newLine)
|
||||
|
||||
# newLine = regex.sub(r"(<)<ent type='ORG'>(di)</ent>(v>)", r"\1\2\3", newLine)
|
||||
newLine = regex.sub(r"(<ent type='[A-z]+?'>[^<]*?)<ent type='[A-z]+?'>([^<]+?)</ent>([^<]*?</ent>)", r"\1\2\3", newLine)
|
||||
newLine = regex.sub(r"(<ent type='[A-z]+?'>[^<]*?)<ent type='[A-z]+?'>([^<]+?)</ent>([^<]*?</ent>)", r"\1\2\3", newLine)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue