I am using stackexchange api to get comments from 2000 to 2019 August. It looks like I only iterate through 2 pages. I am not sure whether my mistake is in the api parameters or in the iteration process.
This is my code looks like.
import requests
from datetime import datetime
import json
import csv
import os
import pprint
pp = pprint.PrettyPrinter(indent=4)
def write_to_json(data):
curr_dir = os.getcwd()
output_file_path = os.path.join(curr_dir, 'so_comment1.json')
with open(output_file_path, 'w') as outfile:
json.dump(data, outfile)
def get_comments(fromdate, todate):
so_url = 'https://api.stackexchange.com/2.2/comments?site=stackoverflow&filter=!1zSn*g7xPU9g6(VDTS7_c&fromdate=' \
+str(fromdate)+'&todate='+str(todate)+'&pagesize=100'
headers = {"Content-type": "application/json"}
resp = requests.get(so_url, headers = headers)
if resp.status_code != 200:
print('error: ' + str(resp.status_code))
else:
print('Success')
data = resp.json()
data1 = resp.json()
page_num = 1
if data1['has_more']:
page_num += 1
so_url = 'https://api.stackexchange.com/2.2/comments?site=stackoverflow&filter=!1zSn*g7xPU9g6(VDTS7_c&fromdate=' \
+str(fromdate)+'&todate='+str(todate)+'&pagesize=100&page='+str(page_num)
resp = requests.get(so_url, headers = headers)
if resp.status_code != 200:
print('error: ' + str(resp.status_code))
else:
print('Success')
data1 = resp.json()
for item in data1['items']:
data['items'].append(item)
write_to_json(data)
def filter_comment_body():
with open('so_comment1.json') as json_file_so:
comments = json.load(json_file_so)
with open('comments1.csv', 'w', encoding='utf-8') as comments_file:
comments_writer = csv.writer(comments_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for item in comments['items']:
comments_writer.writerow([item['body']])
if __name__ == '__main__':
# once comments are written to json file(s) stop calling to get_comments
fromdate = datetime.strptime('Jan 1 2000', '%b %d %Y')
todate = datetime.strptime('Aug 1 2019', '%b %d %Y')
# print(datetime.timestamp(fromdate), ' ', datetime.timestamp(todate))
get_comments(fromdate, todate)
filter_comment_body()
Considering the date range I assume that I will get 1000s of comments. But I only received 200 comments (2 pages)