I've come across a lot of similar questions, but none of the answers provided helped in my case.
I'm trying to run a topic-modeling analysis on roughly 8,000 media articles, but I'm getting this error:
Traceback (most recent call last):
File "extract.py", line 23, in <module>
if re.compile('^(.*?) - \d{2} [a-zA-Z]{3}. \d{4}$').match(lines[1]):
IndexError: list index out of range
Line 23, which the traceback refers to, is this:
# Excerpt of the failing statement. `lines[1]` is indexed without first checking
# that `lines` holds at least two elements; that index is what raises the
# IndexError shown in the traceback.
if re.compile('^(.*?) - \d{2} [a-zA-Z]{3}. \d{4}$').match(lines[1]):
# Text before the first ' - ' on that line, with '*' characters removed.
media = lines[1].split(' - ')[0].replace('*', '')
# Remove every occurrence of that line (together with its leading newline).
article = article.replace('\n' + lines[1], '')
# Record the file when the extracted text still occurs in the remaining article.
if article.find(media) > -1:
containsMediaName.write(filename + '\n')
Can anyone help me work around or ignore this error somehow?
Full code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
import string
import textract
import unicodedata
from unidecode import unidecode
# NOTE(review): the original paste had lost its indentation, so the block
# nesting below is a reconstruction -- confirm it against the real script.

# Second-line header of the form "<text> - DD Mon? YYYY". The unescaped '.'
# (any character) is kept exactly as in the original pattern.
HEADER_RE = re.compile(r'^(.*?) - \d{2} [a-zA-Z]{3}. \d{4}$')
# Third-line marker of the form "Pagina <1-5 digits>".
PAGE_RE = re.compile(r'^Pagina \d{1,5}$')
# Single-line boilerplate spans that are collapsed to one newline.
BOILERPLATE_RES = (
    re.compile(r'\nCopyright(.*?)Alle rechten voorbehouden\n'),
    re.compile(r'\n\(Foto:(.*?)\)\n'),
    re.compile(r'\n\(Fotograaf:(.*?)\)\n'),
)
# Characters trimmed from the end of the text, in addition to str.strip().
TRAILING_JUNK = ' \t\r\n\0'


def clean_article(article):
    """Strip header, page marker, boilerplate and trailing name from a text.

    Args:
        article: The extracted article text (already transliterated).

    Returns:
        A tuple ``(cleaned, contains_media, removed_name)`` where
        ``cleaned`` is the processed text, ``contains_media`` is True when
        the text before ' - ' on the header line still occurs in the article
        after the header line was removed, and ``removed_name`` is the last
        line that was removed from the text, or None if no line was removed.
    """
    article = article.replace('<<', '').replace('>>', '')
    lines = article.split('\n')

    contains_media = False
    # Guard the index: short or empty extractions have fewer than two lines,
    # which previously raised IndexError on lines[1].
    if len(lines) > 1 and HEADER_RE.match(lines[1]):
        media = lines[1].split(' - ')[0].replace('*', '')
        article = article.replace('\n' + lines[1], '')
        contains_media = media in article

    # Same guard for the third line (indexes the ORIGINAL split on purpose).
    if len(lines) > 2 and PAGE_RE.match(lines[2]):
        article = article.replace('\n' + lines[2], '')

    for pattern in BOILERPLATE_RES:
        article = pattern.sub('\n', article)

    article = article.strip().rstrip(TRAILING_JUNK)
    name = article.split('\n')[-1]

    removed_name = None
    # A last line of at most three space-separated parts is removed.
    # An empty last line (empty article) is skipped.
    if name and len(name.split(' ')) <= 3:
        # Literal replacement: the name comes from the document and may
        # contain regex metacharacters, so it must not be used as a pattern.
        article = article.replace('\n' + name, '')
        removed_name = name
        # Skip empty parts (double or leading spaces) so part[0] cannot fail.
        initials = '(' + ''.join(part[0] for part in name.split(' ') if part) + ')'
        article = article.strip()
        if article.endswith(initials):
            article = article.replace(initials, '')

    article = article.strip().rstrip(TRAILING_JUNK)
    return article, contains_media, removed_name


def main():
    """Clean every file in ./data and write the results under ./raw.

    Also writes ./deleted-names.txt ("name,filename" per removed last line)
    and ./contains-media-name.txt (one filename per article for which
    ``clean_article`` reported ``contains_media``).
    """
    if not os.path.isdir('./raw'):
        os.mkdir('./raw')

    # Context managers close the report files even if a file fails to process.
    with open('./deleted-names.txt', 'w') as names, \
            open('./contains-media-name.txt', 'w') as containsMediaName:
        for filename in os.listdir('./data'):
            extracted = textract.process('./data/' + filename).decode('utf-8')
            article, contains_media, removed_name = clean_article(
                unidecode(extracted))

            if contains_media:
                containsMediaName.write(filename + '\n')
            if removed_name is not None:
                names.write(removed_name + ',' + filename + '\n')

            with open('./raw/' + filename + '.txt', 'w') as f:
                f.write(article)


if __name__ == '__main__':
    main()