
I have a data set of tweets retrieved via the Twitter streaming API. However, I regularly want to be updated on how their public metrics change, so I wrote the following code to request those public metrics:

import json

import pandas as pd
import requests


def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    tweets_data_path = 'dataset.txt'
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except json.JSONDecodeError:
            continue
    df = pd.DataFrame.from_dict(pd.json_normalize(tweets_data), orient='columns')
    # Turn the id column into a comma-separated string without spaces
    df_id = (str(str((df['id'].tolist()))[1:-1])).replace(" ", "")
    ids = "ids=" + df_id
    url = "https://api.twitter.com/2/tweets?{}&{}".format(ids, tweet_fields)
    return url


def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {'AAAAAAAAAAAAAAAAAAAAAN%2B7QwEAAAAAEG%2BzRZkmZ4HGizsKCG3MkwlaRzY%3DOwuZeaeHbeMM1JDIafd5riA1QdkDabPiELFsguR4Zba9ywzzOQ'}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r


def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()


def main():
    url = create_url()
    json_response = connect_to_endpoint(url)
    print(json.dumps(json_response, indent=3, sort_keys=True))

if __name__ == "__main__":
    main()

Unfortunately, my data set contains more than 100 IDs and I want to retrieve the metrics for all of them. Since I can only request 100 IDs at a time, could you help me with how to do that? I would also like to make the request daily at midnight and then store the result in a txt file; maybe you can help me with that as well?

1 Answer


You can chunk your data and send it in batches using itertools.islice.

test.py:

import reprlib

from itertools import islice

import pandas as pd


BASE_URL = "https://api.twitter.com/2/tweets"
CHUNK = 100

def req(ids):
    # Stand-in for the real request: just print the URL that would be called
    tmp = reprlib.repr(ids)  # reprlib shortens the long id string for display
    print(f"{BASE_URL}?ids={tmp}")


def main():
    df = pd.DataFrame({"id": range(1000)})
    it = iter(df["id"])

    while chunk := tuple(islice(it, CHUNK)):
        ids = ",".join(map(str, chunk))
        req(ids)


if __name__ == "__main__":
    main()

Test:

$ python test.py
https://api.twitter.com/2/tweets?ids='0,1,2,3,4,5,...5,96,97,98,99'
https://api.twitter.com/2/tweets?ids='100,101,102,...6,197,198,199'
https://api.twitter.com/2/tweets?ids='200,201,202,...6,297,298,299'
https://api.twitter.com/2/tweets?ids='300,301,302,...6,397,398,399'
https://api.twitter.com/2/tweets?ids='400,401,402,...6,497,498,499'
https://api.twitter.com/2/tweets?ids='500,501,502,...6,597,598,599'
https://api.twitter.com/2/tweets?ids='600,601,602,...6,697,698,699'
https://api.twitter.com/2/tweets?ids='700,701,702,...6,797,798,799'
https://api.twitter.com/2/tweets?ids='800,801,802,...6,897,898,899'
https://api.twitter.com/2/tweets?ids='900,901,902,...6,997,998,999'

Note: You'll make multiple requests with this approach, so keep any rate limits in mind.
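
If you want to plug this back into your original script, below is a rough, untested sketch that reuses your dataset.txt loading, requests the public metrics in batches of 100 with a short pause between batches, and appends each JSON response to a text file. The pause length, the metrics_<date>.txt output name and the YOUR_BEARER_TOKEN placeholder are my own assumptions, not part of your code. For running it daily at midnight, the usual approach is to schedule the script externally, e.g. a cron entry like 0 0 * * * python metrics.py on Linux/macOS, or a Task Scheduler job on Windows.

metrics.py:

import json
import time
from datetime import date
from itertools import islice

import pandas as pd
import requests

BASE_URL = "https://api.twitter.com/2/tweets"
TWEET_FIELDS = "tweet.fields=public_metrics"
CHUNK = 100              # the endpoint accepts at most 100 ids per request
PAUSE = 2                # seconds between batches; adjust to your rate limit
BEARER_TOKEN = "YOUR_BEARER_TOKEN"  # placeholder, use your own token


def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {BEARER_TOKEN}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r


def fetch_chunk(ids):
    # One GET request for up to 100 comma-separated tweet ids
    url = f"{BASE_URL}?ids={ids}&{TWEET_FIELDS}"
    response = requests.get(url, auth=bearer_oauth)
    response.raise_for_status()
    return response.json()


def main():
    # Load the ids the same way as in create_url()
    tweets_data = []
    with open("dataset.txt", "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    df = pd.json_normalize(tweets_data)

    it = iter(df["id"])
    out_path = f"metrics_{date.today()}.txt"  # one output file per day

    with open(out_path, "a") as out:
        while chunk := tuple(islice(it, CHUNK)):
            ids = ",".join(map(str, chunk))
            out.write(json.dumps(fetch_chunk(ids)) + "\n")
            time.sleep(PAUSE)  # stay well under the rate limit


if __name__ == "__main__":
    main()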
