analysis.py
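"""Text analysis for extracted articles.

Reads URL IDs from 'Output Data Structure.xlsx', scores each article text in
'extracted_articles/' for sentiment (against the word lists in
'MasterDictionary/') and readability (Gunning fog and related metrics),
filters tokens with the lists in 'StopWords/', and writes the results to
'output_results.xlsx'.
"""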
import os
import re
import ssl

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.data.path.append("nltk_data")
# Disable SSL certificate verification so the NLTK download below succeeds
# even behind intercepting proxies or with outdated certificate stores.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Now download the necessary NLTK resources
nltk.download('punkt', download_dir="nltk_data")
def load_word_list(path):
    """Read one word per line, falling back to Latin-1 when UTF-8 fails."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return set(f.read().splitlines())
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this read cannot fail.
        with open(path, 'r', encoding='latin-1') as f:
            return set(f.read().splitlines())


# Load stop words lists
stop_words_files = [
    'StopWords_Auditor.txt',
    'StopWords_Currencies.txt',
    'StopWords_DatesandNumbers.txt',
    'StopWords_Generic.txt',
    'StopWords_GenericLong.txt',
    'StopWords_Geographic.txt',
    'StopWords_Names.txt',
]
stop_words = set()
for file in stop_words_files:
    stop_words.update(load_word_list(os.path.join('StopWords', file)))

# Load master dictionary
positive_words = load_word_list(os.path.join('MasterDictionary', 'positive-words.txt'))
negative_words = load_word_list(os.path.join('MasterDictionary', 'negative-words.txt'))
# Function to count syllables in a word
def count_syllables(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word and word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if word.endswith("le"):
        count += 1
    if count == 0:
        count = 1
    return count
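# A quick sanity check of the heuristic (it counts vowel groups, drops a
# silent trailing "e", and adds back a syllabic "-le"):
#   count_syllables("analysis") -> 4  (a-na-ly-sis)
#   count_syllables("readable") -> 3  (trailing "e" dropped, "-le" added back)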
def clean_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Remove stop words
    cleaned_tokens = [word for word in tokens if word not in stop_words]
    return cleaned_tokens
# Function to calculate readability scores
def calculate_readability_scores(text):
    sentences = sent_tokenize(text)
    words = [word for sentence in sentences for word in word_tokenize(sentence)]
    # Floor the denominators at 1 so an empty article cannot divide by zero.
    total_words = max(len(words), 1)
    total_sentences = max(len(sentences), 1)
    total_syllables = sum(count_syllables(word) for word in words)
    # Gunning fog counts a word as complex when it has more than two syllables.
    complex_words = [word for word in words if count_syllables(word) > 2]
    total_complex_words = len(complex_words)
    total_personal_pronouns = len(re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.IGNORECASE))
    average_sentence_length = total_words / total_sentences
    percentage_complex_words = (total_complex_words / total_words) * 100
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
    average_word_length = sum(len(word) for word in words) / total_words
    average_words_per_sentence = total_words / total_sentences
    # Report the average syllables per word, as the output column name
    # suggests, rather than the raw document total.
    syllables_per_word = total_syllables / total_words
    return (average_sentence_length, percentage_complex_words, fog_index,
            average_word_length, average_words_per_sentence,
            total_complex_words, syllables_per_word, total_personal_pronouns)
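# Worked example of the fog formula above: an article averaging 20 words per
# sentence with 10% complex words scores 0.4 * (20 + 10) = 12, roughly the
# reading level expected of a US high-school senior.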
def calculate_scores(tokens):
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)
    # The small epsilon keeps both ratios defined when nothing matches.
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score
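# For example, 3 positive and 1 negative matches among 20 cleaned tokens give
# a polarity of (3 - 1) / (3 + 1) = 0.5 and a subjectivity of 4 / 20 = 0.2
# (ignoring the epsilon).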
def main():
    # Read input data structure
    output_structure_df = pd.read_excel('Output Data Structure.xlsx')
    output_data = []
    for index, row in output_structure_df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']
        # Read extracted article text
        try:
            with open(os.path.join('extracted_articles', f"{url_id}.txt"), 'r', encoding='utf-8') as file:
                text = file.read()
        except FileNotFoundError:
            print(f"File '{url_id}.txt' not found in 'extracted_articles' folder.")
            continue  # Skip this file and move on to the next row.
        # Calculate readability scores
        (average_sentence_length, percentage_complex_words, fog_index,
         average_word_length, average_words_per_sentence, total_complex_words,
         syllables_per_word, total_personal_pronouns) = calculate_readability_scores(text)
        # Calculate sentiment scores
        cleaned_tokens = clean_text(text)
        positive_score, negative_score, polarity_score, subjectivity_score = calculate_scores(cleaned_tokens)
        output_data.append({
            'URL_ID': url_id,
            'URL': url,
            'POSITIVE SCORE': positive_score,
            'NEGATIVE SCORE': negative_score,
            'POLARITY SCORE': polarity_score,
            'SUBJECTIVITY SCORE': subjectivity_score,
            'AVERAGE SENTENCE LENGTH': average_sentence_length,
            'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
            'FOG INDEX': fog_index,
            'AVERAGE WORD LENGTH': average_word_length,
            'AVERAGE WORDS PER SENTENCE': average_words_per_sentence,
            'COMPLEX WORD COUNT': total_complex_words,
            'SYLLABLE COUNT PER WORD': syllables_per_word,
            'PERSONAL PRONOUNS COUNT': total_personal_pronouns
        })
    # Create DataFrame from output data and save to Excel
    output_df = pd.DataFrame(output_data)
    output_df.to_excel('output_results.xlsx', index=False)


if __name__ == "__main__":
    main()
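# Usage: place one '<URL_ID>.txt' per row of 'Output Data Structure.xlsx' in
# 'extracted_articles/', then run `python analysis.py`; the metrics land in
# 'output_results.xlsx'. Reading and writing .xlsx files with pandas requires
# the openpyxl package to be installed.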