I'm using this Python script to convert .XML wordpress files to .txt. This works fine for blogposts, however other post types it doesn't like.
I've changed around some code, but it still isn't working as intended for other post types then blogposts. This is the code i currently have:
#!/usr/bin/env python
"""This script converts WXR file to a number of plain text files.
WXR stands for "WordPress eXtended RSS", which basically is just a
regular XML file. This script extracts entries from the WXR file into
plain text files. Output format: article name prefixed by date for
posts, article name for pages.
Usage: wxr2txt.py filename [-o output_dir]
"""
import os
import re
import sys
from xml.etree import ElementTree
NAMESPACES = {
'content': 'http://purl.org/rss/1.0/modules/content/',
'wp': 'http://wordpress.org/export/1.2/',
}
USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]"
def main(argv):
filename, output_dir = _parse_and_validate_output(argv)
try:
data = ElementTree.parse(filename).getroot()
except ElementTree.ParseError:
_error("Invalid input file format. Can not parse the input.")
page_counter, post_counter = 0, 0
for post in data.find('channel').findall('item'):
post_type = post.find('wp:post_type', namespaces=NAMESPACES).text
content = post.find('content:encoded', namespaces=NAMESPACES).text
date = post.find('wp:post_date', namespaces=NAMESPACES).text
title = post.find('title').text
date = date.split(' ')[0].replace('-', '')
title = re.sub(r'[_]+', '_', re.sub(r'[^a-z0-9+]', '_', title.lower()))
if post_type == 'post':
post_filename = date + '_' + title + '.txt'
post_counter += 1
else:
post_filename = title + '.txt'
page_counter += 1
with open(os.path.join(output_dir, post_filename), 'w') as post_file:
post_file.write(content.encode('utf8'))
post_counter += 1
print("Saved {} posts and {} pages in directory '{}'.".format(
post_counter, page_counter, output_dir))
def _parse_and_validate_output(argv):
if len(argv) not in (2, 4):
_error("Wrong number of arguments.")
filename = argv[1]
if not os.path.isfile(filename):
_error("Input file does not exist (or not enough permissions).")
output_dir = argv[3] if len(argv) == 4 and argv[2] == '-o' else os.getcwd()
if not os.path.isdir(output_dir):
_error("Output directory does not exist (or not enough permissions).")
return filename, output_dir
def _error(text):
print(text)
print(USAGE_STRING)
sys.exit(1)
if __name__ == "__main__":
main(sys.argv)
When executing the script, the following errors pops up in the commandprompt:
Traceback (most recent call last):
File "C:\Users\suppo\Desktop\python\script.py", line 71, in <module>
main(sys.argv)
File "C:\Users\suppo\Desktop\python\script.py", line 46, in main
post_file.write(content.encode('utf8'))
TypeError: write() argument must be str, not bytes
So i know i have to encode/decode the content variable. However i really can't seem to figure out how to do this in this script. Can someone point me in the right direction? :)