diff --git a/pythonCode/main.py b/pythonCode/main.py index 5ca6a2c..1ed4d70 100644 --- a/pythonCode/main.py +++ b/pythonCode/main.py @@ -30,50 +30,79 @@ def copyTextFiles(file): # Function runs through the tokens of given file. Entities are stored in array, then returned. Called by regexFile(). def entitycollector(tokens): + # creates a new file that includes all of the found entities. with open('output.txt', 'w') as f: entities = {} + # goes through each entity in the token list. for ent in sorted(tokens.ents): - # if entity.label_ == "NORP" or entity.label_ == "LOC" or entity.label_=="GPE": - # ebb: The line helps experiment with different spaCy named entity classifiers, in combination if you like: - # When using it, remember to indent the next lines for the for loop. - # print(entity.text, entity.label_, spacy.explain(entity.label_)) entityInfo = [ent.text, ent.label_, spacy.explain(ent.label_)] stringify = str(entityInfo) f.write(stringify) f.write('\n') - # PRINT TO FILE - # entities.append(entity.text) entities[ent.text] = ent.label_ + # return all entities with their labels and text. return entities # Function runs regex through given file. def regexFile(file): fileDir = os.path.join(outputPath, file) with PySaxonProcessor(license=False) as proc: - # grabs the original xml file and stores it in a variable for later. + # grabs the original xml file and stores it in a variable for later, to be processed with XQuery below. xml = open(fileDir, encoding='utf-8').read() xp = proc.new_xpath_processor() node = proc.parse_xml(xml_text=xml) xp.set_context(xdm_item=node) + + # xquery goes through original text, and stores it all in a single string. xpath = xp.evaluate('//p ! normalize-space() => string-join()') string = xpath.__str__() + + # regex goes through the text and deletes anything that is not a letter or space. cleanedText = regex.sub('[^A-Za-z]+', ' ', string) + + # gets the tokens of the clean text. 
tokens = nlp(cleanedText) + wrappedText = xml + # grabs all the entities in file and stores it in a list/array. listEntities = entitycollector(tokens) - #print(listEntities) + # if anything exists in the list, the following code will run. if listEntities: + # it will check through each entity in the list and see its entity type. it is looking for "PERSON" tokens + # in this instance, which consists of nouns and names. for entity in listEntities.keys(): - #print(entity, listEntities[entity]) if listEntities[entity] == "PERSON": + # key_template variable is the elements we wrap around found instances. key_template = "<persName>" + entity + "</persName>" + # loops through wrappedText until all entities are wrapped. wrappedText = wrappedText.replace(entity, key_template) - # Saves newly wrapped elements and then writes it into the copied file. + # Saves newly wrapped elements and then writes it into a new file. with open(fileDir, 'w', encoding='utf8') as f: f.write(wrappedText) print("WRAPPING " + entity) +# This part of the code does not run. It is a WIP. +## It tries to find weird or invalid elements/tags and fix them. +def checkTags(file): + content = [] + fileDir = os.path.join(outputPath, file) + + with open(fileDir, 'r', encoding='utf8') as inFile: + for line in inFile: + content.append(line) + # With the contents copied, a loop will go through the array and write it all in a new file in output folder. + with open(fileDir, 'w', encoding='utf8') as f: + for line in content: + match1 = regex.search("(<persName>){2,}(.+?)", line) + if match1: + print("broken line found, fixing...") + newLine = regex.sub("(<persName>){2,}(.+?)",r"\1 \2",line) + print(line + "\n INTO.") + print(newLine) + + for file in insideDir: copyTextFiles(file) regexFile(file) + #checkTags(file) print("File checking finished.") \ No newline at end of file