I forked this repo in order to turn my Discourse site into a static one, but I keep getting this message:

json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

It works for most posts, so it could be that the forum I am trying to archive is too large. The line that is most likely causing trouble is:

posts_json = response.json()['post_stream']['posts']

Any help would be greatly appreciated!
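In case it helps with diagnosing: printing the status code and the start of the raw body shows what the server is sending back instead of the JSON I expect. A quick standalone check against a single topic (just a debugging sketch, not code from the repo; the URL is one example topic from my forum):

import requests

# fetch one topic's JSON directly and look at the raw response
url = 'https://www.phylobabble.org/t/matrix-data-types/219.json'
response = requests.get(url)
print(response.status_code)   # anything other than 200 would explain the decode error
print(response.text[:200])    # start of the raw body; an error page is not valid JSON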
EDIT: I added print(response.raise_for_status()) and got this message:

requests.exceptions.HTTPError: 429 Client Error: Too Many Requests for url: https://www.phylobabble.org//t/matrix-data-types/219.json

I added time.sleep(1) and now it works!
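For anyone else who runs into the 429, the fix is roughly just pausing briefly before each requests.get call in the script. A minimal sketch of the idea (the helper name and delay argument are mine, not from the repo):

import time
import requests

def polite_get(url, cookies=None, delay=1.0):
    # pause before every request so the Discourse rate limiter
    # stops answering with 429 Too Many Requests
    time.sleep(delay)
    return requests.get(url, cookies=cookies)

Each response = requests.get(...) call in write_topic (shown below) can then go through this helper instead.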
# Function that writes out each individual topic page
def write_topic(topic_json):
    topic_download_url = base_url + '/t/' + topic_json['slug'] + '/' + str(topic_json['id'])
    topic_relative_url = 't/' + topic_json['slug'] + '/' + str(topic_json['id'])
    try:
        os.makedirs(topic_relative_url)
    except Exception as err:
        print('in write_topic error:', 'make directory')
    response = requests.get(topic_download_url + '.json', cookies=jar)
    # posts_json will contain only the first 20 posts in a topic
    posts_json = response.json()['post_stream']['posts']
    # posts_stream will grab all of the post ids for that topic
    posts_stream = response.json()['post_stream']['stream']
    # get rid of the first 20 in the stream, as they are already in posts_json
    posts_stream = posts_stream[20:]
    # break the stream into a list of chunks of n post ids each for lighter requests
    n = 9999999
    chunked_posts_stream = [posts_stream[i * n:(i + 1) * n] for i in range((len(posts_stream) + n - 1) // n)]
    posts_download_url = base_url + '/t/' + str(topic_json['id']) + '/posts.json?'
    # make a request for the content associated with each chunk of post ids
    # and append it to the posts_json list
    for chunk in chunked_posts_stream:
        formatted_posts_list = ""
        for post_id in chunk:
            formatted_posts_list = formatted_posts_list + 'post_ids[]=' + str(post_id) + '&'
        response = requests.get(posts_download_url + formatted_posts_list, cookies=jar)
        posts_2_json = response.json()['post_stream']['posts']
        posts_json.extend(posts_2_json)
    # generate that HTML
    post_list_string = ""
    for post_json in posts_json:
        post_list_string = post_list_string + post_row(post_json)
    topic_file_string = topic_template \
        .replace("<!-- TOPIC_TITLE -->", topic_json['fancy_title']) \
        .replace("<!-- JUST_SITE_TITLE -->", str(site_title.text)) \
        .replace("<!-- ARCHIVE_BLURB -->", archive_blurb) \
        .replace("<!-- POST_LIST -->", post_list_string)
    f = open(topic_relative_url + '/index.html', 'w')
    f.write(topic_file_string)
    f.close()
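A fixed one-second sleep is enough for my forum, but a slightly more robust option would be to retry on 429 and honor the Retry-After header when the server sends one. A rough sketch (the function name and retry counts are my own, not part of the repo):

import time
import requests

def get_with_retry(url, cookies=None, max_retries=5):
    # GET a URL, backing off and retrying whenever the server
    # rate-limits the request with a 429 response
    for attempt in range(max_retries):
        response = requests.get(url, cookies=cookies)
        if response.status_code != 429:
            response.raise_for_status()
            return response
        # honor Retry-After when present, otherwise back off exponentially
        wait = int(response.headers.get('Retry-After', 2 ** attempt))
        time.sleep(wait)
    raise RuntimeError('still rate-limited after %d attempts: %s' % (max_retries, url))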