I am scraping tweets from Twitter. Because Twitter rate-limits requests, I scrape 2500 tweets every 15 minutes; however, each run after the 15-minute wait returns the same tweets. Is there any way to skip the previously scraped tweets using some kind of offset? Thank you!

Here is my code:

    # Import libraries
    import time

    import pandas as pd
    import tweepy
    from tweepy import OAuthHandler

    # Twitter credentials
    consumer_key = ''
    consumer_secret = ''
    access_key = ''
    access_secret = ''

    # Pass your Twitter credentials to tweepy via its OAuthHandler
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    def extract_tweets(search_words, date_since, numTweets):
        return tweepy.Cursor(api.search, q=search_words, lang="en",
                             since=date_since, tweet_mode='extended').items(numTweets)

    def scrapetweets(search_words, date_since, numTweets, numRuns):
        # Define a pandas dataframe to store the tweet data:
        db_tweets = pd.DataFrame(columns=['username', 'acctdesc', 'location', 'following',
                                          'followers', 'totaltweets', 'usercreatedts',
                                          'tweetcreatedts', 'retweetcount', 'text', 'hashtags'])

        for i in range(numRuns):
            tweets = extract_tweets(search_words, date_since, numTweets)
            # Store these tweets in a Python list
            tweet_list = [tweet for tweet in tweets]
            print(len(tweet_list))
            noTweets = 0

            for tweet in tweet_list:
                username = tweet.user.screen_name
                acctdesc = tweet.user.description
                location = tweet.user.location
                following = tweet.user.friends_count
                followers = tweet.user.followers_count
                totaltweets = tweet.user.statuses_count
                usercreatedts = tweet.user.created_at
                tweetcreatedts = tweet.created_at
                retweetcount = tweet.retweet_count
                hashtags = [h['text'] for h in tweet.entities['hashtags']]
                try:
                    text = tweet.retweeted_status.full_text
                except AttributeError:  # not a retweet
                    text = tweet.full_text

                itweet = [username, acctdesc, location, following, followers, totaltweets,
                          usercreatedts, tweetcreatedts, retweetcount, text, hashtags]
                db_tweets.loc[len(db_tweets)] = itweet

                noTweets += 1
                print(noTweets, itweet)

            print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
            if i + 1 != numRuns:
                time.sleep(920)  # wait out Twitter's 15-minute rate-limit window

        # Write the dataframe once, after all runs; appending it inside the loop
        # would re-write the rows from every earlier run and duplicate them.
        filename = "tweets.csv"
        db_tweets.to_csv(filename, mode='a', index=False)
        
    # Initialise these variables:
    search_words = "#India OR #COVID-19"
    date_since = "2020-04-29"
    numTweets = 2500
    numRuns = 10

    # Call the function scrapetweets
    program_start = time.time()
    scrapetweets(search_words, date_since, numTweets, numRuns)
    program_end = time.time()
    print('Scraping has completed!')
    print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))

I referred to a blog post on Medium for this purpose.

1 Answer


You can add a variable as a validator and store it in a file, say tweetid.txt.

Each time you run the script, open tweetid.txt.

If a tweet's ID is already in the file, skip that tweet.
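
For example, here is a minimal sketch of that idea, built on the same tweepy v3 `api.search` setup as the question. Instead of comparing every ID, it persists only the highest tweet ID seen so far to tweetid.txt and passes it back as `since_id` on the next run, so the Search API returns only tweets newer than that ID. The file name and the helper names (`load_last_id`, `save_last_id`, `extract_new_tweets`) are just placeholders:

    import os
    import tweepy

    ID_FILE = "tweetid.txt"  # hypothetical file holding the highest tweet id seen so far

    def load_last_id():
        # Return the saved id, or None on the very first run
        if os.path.exists(ID_FILE):
            with open(ID_FILE) as fp:
                saved = fp.read().strip()
                if saved:
                    return int(saved)
        return None

    def save_last_id(last_id):
        with open(ID_FILE, "w") as fp:
            fp.write(str(last_id))

    def extract_new_tweets(api, search_words, numTweets):
        kwargs = dict(q=search_words, lang="en", tweet_mode='extended')
        last_id = load_last_id()
        if last_id is not None:
            # since_id asks the Search API for tweets newer than this id only
            kwargs['since_id'] = last_id
        tweets = list(tweepy.Cursor(api.search, **kwargs).items(numTweets))
        if tweets:
            # Persist the highest id so the next run skips everything already scraped
            save_last_id(max(tweet.id for tweet in tweets))
        return tweets

Tweet IDs increase over time, so keeping just the maximum ID is enough; you do not need to store every ID you have seen. `extract_new_tweets` could stand in for the question's `extract_tweets` (with the `api` object passed in). Separately, constructing the client with `tweepy.API(auth, wait_on_rate_limit=True)` would let tweepy wait out the 15-minute window for you instead of the manual `time.sleep(920)`.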