# decentralized-id.github.io/twscraper.py

import csv
import os

import pandas as pd
import tweepy

#### Credentials
# The four Twitter API secrets are read from environment variables of the
# same name, so nothing sensitive lives in the source file.
CONSUMER_KEY = os.environ.get('CONSUMER_KEY')
CONSUMER_SECRET = os.environ.get('CONSUMER_SECRET')
ACCESS_KEY = os.environ.get('ACCESS_KEY')
ACCESS_SECRET = os.environ.get('ACCESS_SECRET')

# Fail fast with a readable message instead of handing None to tweepy and
# only finding out on the first API request.
_missing = [
    name
    for name in ('CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_KEY', 'ACCESS_SECRET')
    if not os.environ.get(name)
]
if _missing:
    raise RuntimeError(
        "Missing Twitter credentials in environment: " + ", ".join(_missing)
    )

#### Authorization
# Build the OAuth handler from the constants defined above.  The previous
# code referenced lowercase names (consumer_key, access_token, ...) that were
# never defined, which raised NameError before any request was made.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

#### Keywords
# Search terms; each one is issued as a separate query by the scrape loop.
keywords = [
    "#verifiablecredentials",
    "#selfsovereignidentity",
    "Self+Sovereign+Identity",
    "Hyperledger+Aries",
    "DIDComm",
    "Key+Event+Receipt+Infrastructure",
    "#ToIP",
    "#TrustoverIP",
    "w3c+Credentials",
]

#### Get Date
from datetime import date, timedelta, datetime

#### Find 7 Days Ago
# Read the clock exactly once and derive every other value from it, so the
# search window and the file-name stamp cannot disagree when the script
# happens to start right around midnight.
now = datetime.now()
current_date = now.date()
# Lower bound of the search window: one week before today.
days_before = current_date - timedelta(days=7)
# MMDDYY stamp used in the output file name.
date_time = current_date.strftime("%m%d%y")

#### Open CSV + Write Column Names
# Imported once here instead of on every URL iteration inside the loop.
from webpreview import web_preview

fname = 'SSI-DID_' + date_time + '.csv'

# Header row; the order must match the `line` list built for each tweet below.
COLUMNS = ["Time","ID", "Link", "Likes", "Shares", "User", "Text", "Hashtags", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteText", "QuoteImg", "QuoteUrl"]

# Accounts whose tweets are never written to the CSV.
EXCLUDED_USERS = {"Docbasia"}

# Links containing any of these substrings are recorded in the "Urls" column
# but no preview metadata is fetched for them (twitter/social links, images, pdfs).
NO_PREVIEW_MARKERS = ('twitter.com', '.png', '.jpg', '.pdf', 'instagram.com', 'linkedin.com', 'facebook.com')

# Browser-like User-Agent sent with every preview request.
PREVIEW_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Tweet URLs and tweet texts already written; sets give O(1) membership tests.
lines_seen = set()
text_seen = set()

# newline='' is what the csv module expects from its file object, and an
# explicit utf-8 encoding keeps emoji / non-ASCII tweet text from failing on
# platforms whose default encoding is narrower.  The `with` block closes the
# file even if the scrape aborts part-way through.
with open(fname, 'w', newline='', encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(COLUMNS)

    for keyword in keywords:
        # Search hashtags\keywords
        search = tweepy.Cursor(api.search, q=keyword + ' -filter:retweets', count=100, tweet_mode="extended", lang="en", since=days_before)
        for tweet in search.items():
            ### Set basic tweet attributes
            username = tweet.user.screen_name
            idstr = tweet.id_str
            tweet_url = "https://twitter.com/" + username + "/status/" + idstr
            text = tweet.full_text
            quoted = getattr(tweet, 'quoted_status', None)

            #### Only add line to csv if it's not already been added
            # A tweet is a duplicate when its URL or its exact text was
            # already written, or when it quotes a tweet that was already
            # written as its own row.
            seen = tweet_url in lines_seen or text in text_seen
            quotedid = ''
            if quoted is not None:
                quotedid = 'https://twitter.com/' + quoted.user.screen_name + '/status/' + quoted.id_str
                print("Quoted ID " + quotedid)
                if quotedid in lines_seen:
                    print("Quoted Status Seen")
                    seen = True
            if seen or username in EXCLUDED_USERS:
                continue

            ### Keep track of seen lines \ tweets
            lines_seen.add(tweet_url)
            text_seen.add(text)

            retweetcount = tweet.retweet_count
            favorites = tweet.favorite_count
            hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]
            created = str(tweet.created_at)

            ### Check for reply id
            # Tweet permalinks have the form /<screen_name>/status/<id>; the
            # previous code joined the numeric user id and the status id
            # without "/status/", which does not resolve to the parent tweet.
            replink = ""
            reply_id = getattr(tweet, 'in_reply_to_status_id_str', None)
            reply_user = getattr(tweet, 'in_reply_to_screen_name', None)
            if reply_id and reply_user:
                replink = "https://twitter.com/" + reply_user + "/status/" + reply_id
            else:
                print("no reply url")

            ### Check for images in tweet
            medias = []
            if 'media' in tweet.entities:
                medias = [media['media_url_https'] for media in tweet.extended_entities['media']]

            ### Check for urls in tweet
            lnks = []
            title = []
            description = []
            image = []
            for url in tweet.entities.get('urls', []):
                lkn = url['expanded_url']
                lnks.append(lkn)
                ### Unless link is an image pdf twitter or insta
                if any(marker in lkn for marker in NO_PREVIEW_MARKERS):
                    continue
                ### Look for metadata: get title img description
                try:
                    print('>>Getting Link Metadata<<')
                    tit, desc, ima = web_preview(lkn, timeout=10, headers=PREVIEW_HEADERS, parser='lxml')
                except Exception:
                    # Best effort: any failure while fetching or parsing the
                    # page just leaves the metadata columns without an entry.
                    # `Exception` (not a bare except) lets Ctrl-C through.
                    print("broken link")
                    continue
                title.append(tit)
                description.append(desc)
                image.append(ima)

            ### If it's a quote-tweet, get original stats
            # The quote media/url lists keep their leading '' element so the
            # CSV cell format stays the same as in earlier exports.
            qtid = ''
            qttext = ''
            qtmedia = ['']
            qturls = ['']
            if quoted is not None:
                print("Quoted Status NotSeen")
                qttext = quoted.full_text
                qtid = quotedid
                if 'media' in quoted.entities:
                    qtmedia.extend(media['media_url_https'] for media in quoted.extended_entities['media'])
                if 'urls' in quoted.entities:
                    qturls.extend(url['expanded_url'] for url in quoted.entities['urls'])

            #### Column attributes
            # The id is wrapped in single quotes so it is stored as text
            # (presumably to stop spreadsheet tools from rounding it - confirm).
            line = [created, "'"+idstr+"'", tweet_url, favorites, retweetcount, username, text, hashtags, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]

            #### Write row to CSV and print line
            csvWriter.writerow(line)
            print(line)
print("Complete")
add tweet scraper 2020-10-28 21:39:54 -04:00			`import tweepy`
			`import csv`
			`import pandas as pd`

			`#### Credentials`
			`CONSUMER_KEY = os.environ.get('CONSUMER_KEY')`
			`CONSUMER_SECRET = os.environ.get('CONSUMER_SECRET')`
			`ACCESS_KEY = os.environ.get('ACCESS_KEY')`
			`ACCESS_SECRET = os.environ.get('ACCESS_SECRET')`

update scraper 2020-12-05 15:17:49 -05:00
add tweet scraper 2020-10-28 21:39:54 -04:00			`#### Authorization`
			`auth = tweepy.OAuthHandler(consumer_key, consumer_secret)`
			`auth.set_access_token(access_token, access_token_secret)`
			`api = tweepy.API(auth,wait_on_rate_limit=True)`

			`#### Keywords`
update scraper 2020-12-05 15:17:49 -05:00			`keywords = ["#verifiablecredentials","#selfsovereignidentity","Self+Sovereign+Identity","Hyperledger+Aries","DIDComm","Key+Event+Receipt+Infrastructure","#ToIP","#TrustoverIP","w3c+Credentials"]`
add tweet scraper 2020-10-28 21:39:54 -04:00
			`#### Get Date`
			`from datetime import date, timedelta, datetime`

			`#### Find 7 Days Ago`
			`current_date = date.today()`
			`days_before = (date.today()-timedelta(days=7))`
			`now = datetime.now()`
			`date_time = current_date.strftime("%m%d%y")`

			`#### Open CSV + Write Column Names`
update scraper 2020-12-05 15:17:49 -05:00			`fname = 'SSI-DID_' + date_time + '.csv'`
get quoted + no repeats 2020-10-29 12:24:09 -04:00			`csvFile = open(fname, 'w+')`
add tweet scraper 2020-10-28 21:39:54 -04:00			`csvWriter = csv.writer(csvFile)`
update scraper 2020-12-05 15:17:49 -05:00			`csvWriter.writerow(["Time","ID", "Link", "Likes", "Shares", "User", "Text", "Hashtags", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteText", "QuoteImg", "QuoteUrl"])`
remove printlines, doubles, don't scrape twitter links 2020-10-29 17:32:18 -04:00			`lines_seen = []`
update scraper 2020-12-05 15:17:49 -05:00			`text_seen = []`
add tweet scraper 2020-10-28 21:39:54 -04:00
			`for keyword in keywords:`
remove printlines, doubles, don't scrape twitter links 2020-10-29 17:32:18 -04:00			`# Search hashtags\keywords`
update scraper 2020-12-05 15:17:49 -05:00			`for tweet in tweepy.Cursor(api.search,q=keyword + ' -filter:retweets', count=100, tweet_mode="extended", lang="en", since=days_before).items():`
			`### Reset Variables`
add tweet scraper 2020-10-28 21:39:54 -04:00			`medias = []`
			`lnks = []`
			`replink = ""`
			`title = []`
			`description = []`
			`image = []`
get quoted + no repeats 2020-10-29 12:24:09 -04:00			`qtid = ''`
			`qttext = ''`
			`qtmedia = ['']`
			`qturls = ['']`
update scraper 2020-12-05 15:17:49 -05:00			`seen = 'FALSE'`
get quoted + no repeats 2020-10-29 12:24:09 -04:00
update scraper 2020-12-05 15:17:49 -05:00			`### Set basic tweet attributes`
add tweet scraper 2020-10-28 21:39:54 -04:00			`retweetcount = tweet.retweet_count`
			`favorites = tweet.favorite_count`
			`username = tweet.user.screen_name`
			`id = "https://twitter.com/" + username + "/status/" + tweet.id_str`
update scraper 2020-12-05 15:17:49 -05:00			`idstr = tweet.id_str`
add tweet scraper 2020-10-28 21:39:54 -04:00			`text = tweet.full_text`
			`hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]`
update scraper 2020-12-05 15:17:49 -05:00			`created = str(tweet.created_at)`

			`#### Only add line to csv if it's not already been added`
get quoted + no repeats 2020-10-29 12:24:09 -04:00			`if hasattr(tweet, 'quoted_status'):`
update scraper 2020-12-05 15:17:49 -05:00			`quotedid = 'https://twitter.com/' + tweet.quoted_status.user.screen_name + '/status/' + tweet.quoted_status_id_str`
			`print("Quoted ID " + quotedid)`
			`if quotedid in lines_seen:`
			`print("Quoted Status Seen")`
			`seen = 'TRUE'`
remove printlines, doubles, don't scrape twitter links 2020-10-29 17:32:18 -04:00			`for y in lines_seen:`
			`if id == y:`
get quoted + no repeats 2020-10-29 12:24:09 -04:00			`seen = 'TRUE'`
update scraper 2020-12-05 15:17:49 -05:00			`for q in text_seen:`
			`if text == q:`
			`seen = 'TRUE'`
			`if seen == 'TRUE' or username == "Docbasia": continue`
get quoted + no repeats 2020-10-29 12:24:09 -04:00			`else:`
update scraper 2020-12-05 15:17:49 -05:00			`### Keep track of seen lines \ tweets`
			`lines_seen.append(id)`
			`text_seen.append(text)`
			`### Check for reply id`
			`try:`
			`reply = tweet.in_reply_to_status_id_str`
			`user = tweet.in_reply_to_user_id_str`
			`replink = "https://twitter.com/" + user + "/" + reply`
			`except:`
			`print("no reply url")`
			`### Check for images in tweet`
			`if 'media' in tweet.entities:`
			`for media in tweet.extended_entities['media']:`
			`medias.append(media['media_url_https'])`
			`### Check for urls in tweet`
			`if 'urls' in tweet.entities:`
			`for url in tweet.entities['urls']:`
			`lkn = url['expanded_url']`
			`lnks.append(url['expanded_url'])`
			`### Look for metadata`
			`from webpreview import web_preview`
			`### Unless link is an image pdf twitter or insta`
			`if username == "Docbasia" or 'twitter.com' in lkn or '.png' in lkn or '.jpg' in lkn or '.pdf' in lkn or 'instagram.com' in lkn or 'linkedin.com' in lkn or 'facebook.com' in lkn: continue`
			`else:`
			`try:`
			`### get title img description`
			`print('>>Getting Link Metadata<<')`
			`headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}`
			`tit, desc, ima = web_preview(lkn,timeout=10,headers=headers,parser='lxml')`
			`title.append(tit)`
			`description.append(desc)`
			`image.append(ima)`
			`except:`
			`print("broken link")`
			`### If it's a quote-tweet, get original stats`
			`if hasattr(tweet, 'quoted_status'):`
			`print("Quoted Status NotSeen")`
			`qtmedia = ['']`
			`qturls = ['']`
			`qttext = tweet.quoted_status.full_text`
			`qtuser = tweet.quoted_status.user.screen_name`
			`qtid = "https://twitter.com/" + qtuser + "/status/" + tweet.quoted_status.id_str`
			`if 'media' in tweet.quoted_status.entities:`
			`for media in tweet.quoted_status.extended_entities['media']:`
			`qtmedia.append(media['media_url_https'])`
			`if 'urls' in tweet.quoted_status.entities:`
			`for url in tweet.quoted_status.entities['urls']:`
			`qturls.append(url['expanded_url'])`
			`#### Column attributes`
			`line = [created, "'"+idstr+"'", id, favorites, retweetcount, username, text, hashtags, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]`

			`#### Write row to CSV and print line`
get quoted + no repeats 2020-10-29 12:24:09 -04:00			`csvWriter.writerow(line)`
			`print(line)`
add tweet scraper 2020-10-28 21:39:54 -04:00			`csvFile.close()`
update scraper 2020-12-05 15:17:49 -05:00			`print("Complete")`