created main.py with code

2025-08-01 02:26:05 -04:00 · 2023-03-27 17:14:07 -04:00 · 2023-03-27 17:14:07 -04:00 · d18a6f5b16
commit d18a6f5b16
parent 3dfe20153d
2 changed files with 86 additions and 8 deletions
--- a/pythonCode/main.py
+++ b/pythonCode/main.py
@ -0,0 +1,80 @@
+import spacy
+from collections import Counter
+import os
+# Uncomment this line if you need the language model.
+# If you already have it, comment it ou.
+# Let's try the different spaCy language models for this. We can compare _lg with _md or _sm
+workingDir = os.getcwd()
+CollPath = os.path.join(workingDir, '../regexConsp')
+insideDir = os.listdir(CollPath)
+print(insideDir)
+
+nlp = spacy.load("en_core_web_lg")
+def readTextFiles(filepath):
+    with open(filepath, 'r', encoding='utf8') as f:
+        readFile = f.read()
+        # print(readFile)
+        stringFile = str(readFile)
+        # lengthFile = len(readFile)
+        # print(lengthFile)
+        tokens = nlp(stringFile)
+        # print(tokens)
+        listEntities = entitycollector(tokens)
+        print(listEntities)
+        # cardinal_freq = Counter(listCardinals)
+        # topTen = cardinal_freq.most_common(10)
+        # print(topTen)
+
+
+
+def entitycollector(tokens):
+    entities = []
+    for entity in tokens.ents:
+        # if entity.label_ == "CARDINAL":
+        print(entity.text, entity.label_, spacy.explain(entity.label_))
+        entities.append(entity.text)
+    return entities
+
+
+for file in os.listdir(CollPath):
+    if file.endswith(".xml"):
+        filepath = f"{CollPath}/{file}"
+        print(filepath)
+        readTextFiles(filepath)
+
+
+
+
+
+# print(listCardinals)
+# cardinal_freq = Counter(listCardinals)
+# topTen = cardinal_freq.most_common(10)
+# print(topTen)
+
+# grimm = open('grimm.txt', 'r', encoding='utf8')
+# grimmDoc = grimm.read()
+# grimmNLP = nlp(grimmDoc)
+# grimmmSentences = grimmNLP.sents
+
+# def sentenceLengths(sentences):
+#     lengths = []
+#     for s in sentences:
+#         length = len(s.text)
+#         lengths.append(length)
+#     return sorted(lengths)
+
+
+# grimmLengths = sentenceLengths(grimmmSentences)
+# # print(grimmLengths)
+# maxVal = max(grimmLengths)
+# minVal = min(grimmLengths)
+# print('The shortest sentence is ' + str(minVal) + ' characters long.')
+# print('The longest sentence is ' + str(maxVal) + ' characters long.')
+
+# for sentence in grimmNLP.sents:
+# #    print(sentence.text)
+#     length = len(sentence.text)
+#     if length == minVal:
+#         print("The shortest sentence is: " + sentence.text)
+#     if len(sentence.text) == maxVal:
+#         print('The longest sentence is: ' + sentence.text + ' :' + str(maxVal) + 'characters')
--- a/pythonCode/test.py
+++ b/pythonCode/test.py
@ -1,8 +1,6 @@
-home = C:\Users\Nathan\AppData\Local\Programs\Python\Python311
-implementation = CPython
-version_info = 3.11.1.final.0
-virtualenv = 20.16.7
-include-system-site-packages = false
-base-prefix = C:\Users\Nathan\AppData\Local\Programs\Python\Python311
-base-exec-prefix = C:\Users\Nathan\AppData\Local\Programs\Python\Python311
-base-executable = C:\Users\Nathan\AppData\Local\Programs\Python\Python311\python.exe
+import os
+
+workingDir = os.getcwd()
+CollPath = os.path.join(workingDir, '../regexConsp')
+insideDir = os.listdir(CollPath)
+print(insideDir)