I've searched Stack Overflow and several other websites, but I still haven't found a solution. My issue is that I'm trying to access two different folders containing "ham" or "spam" emails to build a dataset for model training. I keep getting a permission error, and I'm unsure how to solve it through Python or Windows Explorer. I would like to know how to solve it in multiple ways so I can better understand the problem.
Here's the code:
ham = 'ham'
spam = 'spam'
data = 'emails2'
hamfiles = []
spamfiles = []
'''Searching File Path'''
print('# MESSAGE: Finding for files ----------------------------------------------------------------------------------')
for subdir, folders, files in os.walk(data):
if subdir.__contains__(ham):
# print(subdir)
for file in files:
# print(os.path.join(subdir, file))
hamfiles.append(os.path.join(subdir, file))
else:
for file in files:
# print(os.path.join(subdir, file))
spamfiles.append(os.path.join(subdir, file))
import glob

X_file = []
y_class = []
eof = [('eof')]


def _append_tokens(paths, label, words, labels):
    """Tokenize and POS-tag each file in *paths*, appending every word to
    *words* and *label* to *labels* (one label per appended word, so the
    two lists stay aligned).

    NOTE(review): relies on `nltk` being imported elsewhere in the file.
    """
    for pattern in paths:
        # glob is kept for backward compatibility in case a stored path
        # contains wildcard characters; for plain paths it is a no-op.
        for path in glob.glob(pattern):
            # `with` guarantees every handle is closed. The original code
            # leaked every handle except the very last ham file (only
            # `h.close()` was ever called, after the spam loop) — open
            # handles are a classic cause of PermissionError on Windows.
            with open(path, encoding='UTF8', errors='replace') as fh:
                text = fh.read()
            '''Tokenize'''
            tokens = nltk.word_tokenize(text)
            '''Part Of Speech Tagging'''
            tagged = nltk.pos_tag(tokens)
            '''Append to Array'''
            for word, _tag in tagged:
                words.append(word)
                labels.append(label)


_append_tokens(hamfiles, 'ham', X_file, y_class)
_append_tokens(spamfiles, 'spam', X_file, y_class)

print('# MESSAGE: Print X_ham ----------------------------------------------------------------------------------------')
print(X_file)
def create_lexicon(X_file, y_class):
    """Build a flat token list from the first `hm_lines` lines of the two
    files named by *X_file* and *y_class*.

    :param X_file: path of the first file to tokenize
    :param y_class: path of the second file to tokenize
    :returns: list of all tokens gathered from both files

    NOTE(review): depends on module-level `hm_lines` and `word_tokenize`
    (presumably nltk's) being defined elsewhere in the file — confirm.
    """
    lexicon = []
    # Both files receive identical treatment, so process them in one loop
    # instead of two copy-pasted stanzas.
    for path in (X_file, y_class):
        with open(path, 'r+') as f:
            contents = f.readlines()
        for line in contents[:hm_lines]:
            lexicon += list(word_tokenize(line))
    # The original built `lexicon` but never returned it, so callers
    # always received None.
    return lexicon
I understand that it could be a Windows permission error, but I have never encountered this before.