I have the code below, which I use to convert Word documents to .txt files. It works for .docx documents, but for .doc documents it works on one system and raises an "antiword" error on another. Both are Windows 11 systems.
def doc2txt(input_report_path, final_report_path):
    """Convert a Word document (.docx or .doc) to a UTF-8 plain-text file.

    ``.docx`` files are read with python-docx: the text of every entry in
    ``doc.paragraphs`` is joined with newlines. ``.doc`` files are read
    with textract, which delegates to the external ``antiword`` program;
    that executable has to be installed separately and be discoverable on
    PATH on every machine that runs this function.

    Args:
        input_report_path: Path of the source document. The extension
            (compared case-insensitively) selects the extraction method.
        final_report_path: Path of the text file to write. It is only
            opened once text extraction has succeeded, so a failed
            conversion never creates or truncates it.

    Returns:
        None. Any failure (unsupported extension, missing ``antiword``,
        unreadable input, unwritable output) is reported with ``print``
        instead of being raised, so callers converting many files can
        keep going.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import shutil

    file_extension = os.path.splitext(input_report_path)[1].lower()
    try:
        if file_extension == ".docx":
            # Extract text from .docx file using python-docx.
            doc = Document(input_report_path)
            text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
        elif file_extension == ".doc":
            # Extract text from .doc file using textract.
            try:
                raw = textract.process(input_report_path, encoding="utf-8")
            except Exception as exc:
                # Only replace the message when the likely cause is a
                # missing antiword binary; any other failure is re-raised
                # untouched so its original message is printed below.
                if shutil.which("antiword") is None:
                    raise RuntimeError(
                        "textract needs the external 'antiword' executable "
                        "to read .doc files, but it was not found on PATH. "
                        "Install antiword and add its folder to PATH "
                        f"(original error: {exc})"
                    ) from exc
                raise
            text = raw.decode("utf-8")
        else:
            # Fail before the output file is opened: otherwise an
            # unsupported input would leave behind an empty output file.
            raise ValueError(
                f"unsupported file extension {file_extension!r}; "
                "expected '.docx' or '.doc'"
            )

        with open(final_report_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(text)
    except Exception as e:
        # Deliberate best-effort boundary: report and continue.
        print(f"Error converting {input_report_path}: {e}")