
I am using the full archive search of the Twitter API to extract data on past events. I have downloaded the code sample and modified it a bit to also save my data to a file on my local drive, and this is all working well. But I do not know how to implement pagination.

I know that tweepy offers a .pages() method (on its Cursor) that takes care of pagination, but my current script uses requests.
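
For reference, this is the kind of helper I mean; as far as I can tell from the tweepy docs, the v2 counterpart of Cursor.pages() is Paginator. This is only a rough, untested sketch and not what my script uses:

import tweepy

client = tweepy.Client(bearer_token="MYTOKEN", wait_on_rate_limit=True)

# Paginator follows meta.next_token behind the scenes; each iteration yields one page
for page in tweepy.Paginator(client.search_all_tweets,
                             query="#WEURO2022",
                             tweet_fields=["author_id", "created_at"],
                             max_results=500,
                             limit=10):
    for tweet in page.data or []:
        print(tweet.id, tweet.created_at)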

I tried adding a while loop in my main function using ["next_token"], but I did not really understand the Twitter documentation and could not make it work.

Here is what I have got so far:

# Extended script for full archive search with Twitter academic API
# based on a sample provided by Twitter
# for documentation, see https://developer.twitter.com/en/products/twitter-api/academic-research

import requests
import os
import json

# STEP 1: add bearer token for your academic Twitter API dev account

bearer_token = "MYTOKEN"

# STEP 2: define which API endpoint to query: "all" or "recent"

search_url = "https://api.twitter.com/2/tweets/search/all"

# Optional params: 
# start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields

# STEP 3: define query parameters

query_params = {'query': '#WEURO2022',
                'tweet.fields': 'author_id,conversation_id,created_at',
                'expansions': 'geo.place_id',
                'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
                'user.fields': 'created_at,description,entities,id,location,name',
                'start_time': '2022-02-15T00:00:01.000Z',
                'end_time': '2022-09-16T23:59:59.000Z',
                'max_results':'500'}

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2RecentSearchPython"
    return r

def connect_to_endpoint(url, params):
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code == 200:
        print("Ready to go!")
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    json_data=response.json()
    # return json_data
    
# write data to JSON file
    with open('C:\\Users\\####\\Downloads\\MyTweets.json', 'a') as json_f:
        json.dump(json_data, json_f)
        print("JSON data written to file!")
            
def main():  
    json_response = connect_to_endpoint(search_url, query_params)
    while json_response["meta"]["next_token"]:
        query_params["next_token"] = json_response["meta"]["next_token"]

if __name__ == "__main__":
    main()

Can you help me fix this, or point me to a tutorial for less experienced users?

OnceUponATime

1 Answer


I have found a way to fix my pagination issue, though I dare say it is a rather inelegant solution:

# Extended script for full archive search with Twitter academic API
# based on a sample provided by Twitter
# for documentation, see https://developer.twitter.com/en/products/twitter-api/academic-research

import requests
import os
import json
import urllib

# STEP 1: add bearer token for your academic Twitter API dev account

bearer_token = "MYTOKEN"

# STEP 2: define which API endpoint to query: "all" or "recent"

search_url = "https://api.twitter.com/2/tweets/search/all"
token_url= "https://api.twitter.com/2/tweets/search/all?next_token="

# Optional params: 
# start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields

# STEP 3: define query parameters and number of pages to be retrieved

# query example: (from:twitterdev -is:retweet) OR #twitterdev

query_params = {'query': '((from:Eurovision) OR #esc OR #esc2018) (#ISR OR #NettaBarzilai OR @NettaBarzilai) lang:en',
                'tweet.fields': 'author_id,conversation_id,created_at',
                'expansions': 'geo.place_id',
                'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
                'user.fields': 'created_at,description,entities,id,location,name',
                'start_time': '2018-02-15T00:00:01.000Z',
                'end_time': '2018-07-16T23:59:59.000Z',
                'max_results':'500'}

pages = 20

token_list=[]

# STEP 4: authenticate

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2RecentSearchPython"
    return r

# STEP 5: connect to endpoint and run query

def connect_to_endpoint(url, params, next_token):
    # If a next_token was stored by the previous call, build the paginated URL
    # by hand; requests then appends the remaining query_params with "&".
    # On the very first call token_list is empty, so the IndexError branch
    # falls back to the plain search URL.
    try:
        if (len(token_list[-1]) >= 1):
            next_token = token_list[-1]
            target = [token_url, str(next_token)]
            url = "".join(target)
            print(url)
        else:
            url = search_url
            print(url)
    except IndexError:
        url = search_url
        print(url)

    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code == 200:
        print("Ready to go!")
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    json_data = response.json()

# STEP 6: write data to JSON file
# (written before reading the next_token, so the last page is not lost
# when the final response no longer contains one)

    with open('C:\\Users\\xxx\\yyy\\NettaESC2018_tweets.json', 'a') as json_f:
        json.dump(json_data, json_f)
        print("JSON data written to file!")

    # Raises KeyError on the last page, which main() catches to stop paginating.
    next_token = json_data["meta"]["next_token"]
    token_list.append(next_token)

    print(token_list)
            
def main():
    for p in range(0, pages):
        try:
            # connect_to_endpoint works out the actual URL itself from token_list,
            # so the first call simply passes None as the next_token.
            connect_to_endpoint(search_url, query_params, None)
        except KeyError:
            print("No more tweets found!")
            break
    
if __name__ == "__main__":
    main()

If anyone has a better suggestion, I am looking forward to it!
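
In the meantime, here is roughly what I imagine a tidier version would look like: a single loop that feeds meta.next_token back into the request params instead of rebuilding the URL by hand. This is only a sketch under the same assumptions as above (placeholder bearer token, the query from my original question, and a relative output path as a placeholder), and I have not run it end to end against the live API:

import requests
import json
import time

bearer_token = "MYTOKEN"
search_url = "https://api.twitter.com/2/tweets/search/all"

query_params = {'query': '#WEURO2022',
                'tweet.fields': 'author_id,conversation_id,created_at',
                'start_time': '2022-02-15T00:00:01.000Z',
                'end_time': '2022-09-16T23:59:59.000Z',
                'max_results': '500'}

max_pages = 20

def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2FullArchiveSearchPython"
    return r

def main():
    next_token = None
    for page in range(max_pages):
        params = dict(query_params)
        if next_token:
            # pagination goes through the params, not through the URL string
            params["next_token"] = next_token

        response = requests.get(search_url, auth=bearer_oauth, params=params)
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)
        json_data = response.json()

        # one JSON object per page, appended to the output file
        with open('MyTweets.json', 'a') as json_f:
            json.dump(json_data, json_f)
            json_f.write('\n')
        print(f"Page {page + 1} written to file")

        next_token = json_data.get("meta", {}).get("next_token")
        if not next_token:
            print("No more tweets found!")
            break
        time.sleep(1)

if __name__ == "__main__":
    main()

The time.sleep(1) is there because, if I remember the docs correctly, the full-archive endpoint only allows about one request per second.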

OnceUponATime