mirror of
https://github.com/nhammer514/textfiles-politics.git
synced 2025-06-08 14:52:38 -04:00
cleaning files
This commit is contained in:
parent
50314b5f26
commit
9acc589f06
679 changed files with 7970 additions and 7452 deletions
22
pythonCode/scrap-files/stuff.py
Normal file
22
pythonCode/scrap-files/stuff.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
import spacy
|
||||
from collections import Counter
|
||||
import re as regex
|
||||
import os
|
||||
from saxonche import PySaxonProcessor
|
||||
|
||||
# Load the spaCy pipeline named "en_core_web_lg" once, at module import time,
# and keep it in the module-level name `nlp`.
nlp = spacy.load("en_core_web_lg")
|
||||
|
||||
def entitycollector(tokens, output_path='conspPERSONentityCollector.txt'):
    """Write every entity in *tokens* to a text file and return them as a dict.

    One line is written per entity, formatted as the ``str()`` of the
    two-item list ``[text, label]`` (for example ``['Alice', 'PERSON']``).
    The output file is truncated on every call.

    Args:
        tokens: Any object exposing an ``ents`` iterable whose items are
            mutually orderable and carry ``text`` and ``label_`` attributes
            (presumably a spaCy ``Doc`` -- confirm against the caller).
        output_path: Destination file. Defaults to the file name this
            function has always written to, so existing callers are
            unaffected.

    Returns:
        A dict mapping entity text to entity label. When the same text
        occurs more than once, the label of the last occurrence (in sorted
        order) wins.
    """
    entities = {}
    # An explicit UTF-8 encoding keeps non-ASCII entity text from raising
    # UnicodeEncodeError (or producing platform-dependent bytes) when the
    # locale's default encoding is not UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        # goes through each entity in the token list.
        for ent in sorted(tokens.ents):
            f.write(f"{[ent.text, ent.label_]}\n")
            entities[ent.text] = ent.label_
    # return all entities with their label and text.
    return entities
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue