Twitter v2 APIs include an endpoint for random sampling and endpoint for filtered tweets.
The latter allows for specifying a random sample percentage in a query (for example, sample:10 will return a random 10% sample).
Note that v2 APIs are still new and at the moment have a cap of 500k tweets per month.
As an example for the latter, the following code (modified version of this, see this doc) will collect streaming data with cat or dog tags and store it in a json file for every 100 tweets. (Note: this does not include the random sampling query.)
import requests
import os
import json
import pandas as pd
# To set your enviornment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'
data = []
counter = 0
def create_headers(bearer_token):
headers = {"Authorization": "Bearer {}".format(bearer_token)}
return headers
def get_rules(headers, bearer_token):
response = requests.get(
"https://api.twitter.com/2/tweets/search/stream/rules", headers=headers
)
if response.status_code != 200:
raise Exception(
"Cannot get rules (HTTP {}): {}".format(response.status_code, response.text)
)
print(json.dumps(response.json()))
return response.json()
def delete_all_rules(headers, bearer_token, rules):
if rules is None or "data" not in rules:
return None
ids = list(map(lambda rule: rule["id"], rules["data"]))
payload = {"delete": {"ids": ids}}
response = requests.post(
"https://api.twitter.com/2/tweets/search/stream/rules",
headers=headers,
json=payload
)
if response.status_code != 200:
raise Exception(
"Cannot delete rules (HTTP {}): {}".format(
response.status_code, response.text
)
)
print(json.dumps(response.json()))
def set_rules(headers, delete, bearer_token):
# You can adjust the rules if needed
sample_rules = [
{"value": "dog has:images", "tag": "dog pictures"},
{"value": "cat has:images -grumpy", "tag": "cat pictures"},
]
payload = {"add": sample_rules}
response = requests.post(
"https://api.twitter.com/2/tweets/search/stream/rules",
headers=headers,
json=payload,
)
if response.status_code != 201:
raise Exception(
"Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
)
print(json.dumps(response.json()))
def get_stream(headers, set, bearer_token):
global data, counter
response = requests.get(
"https://api.twitter.com/2/tweets/search/stream", headers=headers, stream=True,
)
print(response.status_code)
if response.status_code != 200:
raise Exception(
"Cannot get stream (HTTP {}): {}".format(
response.status_code, response.text
)
)
for response_line in response.iter_lines():
if response_line:
json_response = json.loads(response_line)
print(json.dumps(json_response, indent=4, sort_keys=True))
data.append(json_response['data'])
if len(data) % 100 == 0:
print('storing data')
pd.read_json(json.dumps(data), orient='records').to_json(f'tw_example_{counter}.json', orient='records')
data = []
counter +=1
def main():
bearer_token = os.environ.get("BEARER_TOKEN")
headers = create_headers(bearer_token)
rules = get_rules(headers, bearer_token)
delete = delete_all_rules(headers, bearer_token, rules)
set = set_rules(headers, delete, bearer_token)
get_stream(headers, set, bearer_token)
if __name__ == "__main__":
main()
Then, load data in pandas dataframe as
df = pd.read_json('tw_example.json', orient='records')
.