mirror of
https://github.com/Decentralized-ID/decentralized-id.github.io.git
synced 2024-12-17 11:34:30 -05:00
update scraper
This commit is contained in:
parent
3a1b5a4e82
commit
b6f3da5ebf
134
twscraper.py
134
twscraper.py
@@ -8,13 +8,14 @@ CONSUMER_SECRET = os.environ.get('CONSUMER_SECRET')
|
||||
ACCESS_KEY = os.environ.get('ACCESS_KEY')
|
||||
ACCESS_SECRET = os.environ.get('ACCESS_SECRET')
|
||||
|
||||
|
||||
#### Authorization
|
||||
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
|
||||
auth.set_access_token(access_token, access_token_secret)
|
||||
api = tweepy.API(auth,wait_on_rate_limit=True)
|
||||
|
||||
#### Keywords
|
||||
keywords = ['#SelfSovereignIdentity', '#DecentralizedIdentity', '#decentralizedidentity', "#IIW"]
|
||||
keywords = ["#verifiablecredentials","#selfsovereignidentity","Self+Sovereign+Identity","Hyperledger+Aries","DIDComm","Key+Event+Receipt+Infrastructure","#ToIP","#TrustoverIP","w3c+Credentials"]
|
||||
|
||||
#### Get Date
|
||||
from datetime import date, timedelta, datetime
|
||||
@@ -26,21 +27,20 @@ now = datetime.now()
|
||||
date_time = current_date.strftime("%m%d%y")
|
||||
|
||||
#### Open CSV + Write Column Names
|
||||
fname = 'ssitw' + date_time + '.csv'
|
||||
fname = 'SSI-DID_' + date_time + '.csv'
|
||||
csvFile = open(fname, 'w+')
|
||||
csvWriter = csv.writer(csvFile)
|
||||
csvWriter.writerow(["Time", "Link", "Text", "Hashtags", "User", "Favorites", "Following", "Followers", "Retweets", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL","QuoteID","QuoteText","QuoteImg","QuoteUrl"])
|
||||
csvWriter.writerow(["Time","ID", "Link", "Likes", "Shares", "User", "Text", "Hashtags", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteText", "QuoteImg", "QuoteUrl"])
|
||||
lines_seen = []
|
||||
|
||||
text_seen = []
|
||||
|
||||
for keyword in keywords:
|
||||
# Search hashtags\keywords
|
||||
for tweet in tweepy.Cursor(api.search,q=keyword + '-filter:retweets',count=100, tweet_mode="extended", lang="en", since=days_before).items():
|
||||
### Set \ Reset Variables
|
||||
for tweet in tweepy.Cursor(api.search,q=keyword + ' -filter:retweets', count=100, tweet_mode="extended", lang="en", since=days_before).items():
|
||||
### Reset Variables
|
||||
medias = []
|
||||
lnks = []
|
||||
replink = ""
|
||||
rtsr = []
|
||||
title = []
|
||||
description = []
|
||||
image = []
|
||||
@@ -48,70 +48,86 @@ for keyword in keywords:
|
||||
qttext = ''
|
||||
qtmedia = ['']
|
||||
qturls = ['']
|
||||
seen = 'FALSE'
|
||||
|
||||
### Set basic tweet attributes
|
||||
retweetcount = tweet.retweet_count
|
||||
favorites = tweet.favorite_count
|
||||
following = tweet.user.friends_count
|
||||
followers = tweet.user.followers_count
|
||||
username = tweet.user.screen_name
|
||||
id = "https://twitter.com/" + username + "/status/" + tweet.id_str
|
||||
idstr = tweet.id_str
|
||||
text = tweet.full_text
|
||||
hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]
|
||||
created = tweet.created_at
|
||||
try:
|
||||
reply = tweet.in_reply_to_status_id_str
|
||||
user = tweet.in_reply_to_user_id_str
|
||||
replink = "https://twitter.com/" + user + "/" + reply
|
||||
except:
|
||||
print("no reply url")
|
||||
if 'media' in tweet.entities:
|
||||
for media in tweet.extended_entities['media']:
|
||||
medias.append(media['media_url_https'])
|
||||
### Add URLs to array
|
||||
if 'urls' in tweet.entities:
|
||||
quoted = 'FALSE'
|
||||
for url in tweet.entities['urls']:
|
||||
lkn = url['expanded_url']
|
||||
lnks.append(url['expanded_url'])
|
||||
from webpreview import web_preview
|
||||
if username == "Docbasia" or 'twitter.com' in lkn: continue
|
||||
else:
|
||||
try:
|
||||
### get title img description
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
|
||||
tit, desc, ima = web_preview(lkn,timeout=10,headers=headers,parser='lxml')
|
||||
title.append(tit)
|
||||
description.append(desc)
|
||||
image.append(ima)
|
||||
except:
|
||||
print("broken link")
|
||||
### If it's a quote-tweet, get original stats
|
||||
created = str(tweet.created_at)
|
||||
|
||||
#### Only add line to csv if it's not already been added
|
||||
if hasattr(tweet, 'quoted_status'):
|
||||
qtmedia = ['']
|
||||
qturls = ['']
|
||||
qttext = tweet.quoted_status.full_text
|
||||
qtuser = tweet.quoted_status.user.screen_name
|
||||
qtid = "https://twitter.com/" + qtuser + "/status/" + tweet.quoted_status.id_str
|
||||
if 'media' in tweet.quoted_status.entities:
|
||||
for media in tweet.quoted_status.extended_entities['media']:
|
||||
qtmedia.append(media['media_url_https'])
|
||||
if 'urls' in tweet.quoted_status.entities:
|
||||
for url in tweet.quoted_status.entities['urls']:
|
||||
qturls.append(url['expanded_url'])
|
||||
seen = 'FALSE'
|
||||
count = 0
|
||||
quotedid = 'https://twitter.com/' + tweet.quoted_status.user.screen_name + '/status/' + tweet.quoted_status_id_str
|
||||
print("Quoted ID " + quotedid)
|
||||
if quotedid in lines_seen:
|
||||
print("Quoted Status Seen")
|
||||
seen = 'TRUE'
|
||||
for y in lines_seen:
|
||||
if id == y:
|
||||
seen = 'TRUE'
|
||||
|
||||
#### Column attributes
|
||||
line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
|
||||
|
||||
#### Only add line to csv if it's not already been added
|
||||
lines_seen.append(id)
|
||||
if seen == 'TRUE' or username == "Docbasia" :
|
||||
print("Seen. Don't Save.")
|
||||
for q in text_seen:
|
||||
if text == q:
|
||||
seen = 'TRUE'
|
||||
if seen == 'TRUE' or username == "Docbasia": continue
|
||||
else:
|
||||
### Keep track of seen lines \ tweets
|
||||
lines_seen.append(id)
|
||||
text_seen.append(text)
|
||||
### Check for reply id
|
||||
try:
|
||||
reply = tweet.in_reply_to_status_id_str
|
||||
user = tweet.in_reply_to_user_id_str
|
||||
replink = "https://twitter.com/" + user + "/" + reply
|
||||
except:
|
||||
print("no reply url")
|
||||
### Check for images in tweet
|
||||
if 'media' in tweet.entities:
|
||||
for media in tweet.extended_entities['media']:
|
||||
medias.append(media['media_url_https'])
|
||||
### Check for urls in tweet
|
||||
if 'urls' in tweet.entities:
|
||||
for url in tweet.entities['urls']:
|
||||
lkn = url['expanded_url']
|
||||
lnks.append(url['expanded_url'])
|
||||
### Look for metadata
|
||||
from webpreview import web_preview
|
||||
### Unless link is an image pdf twitter or insta
|
||||
if username == "Docbasia" or 'twitter.com' in lkn or '.png' in lkn or '.jpg' in lkn or '.pdf' in lkn or 'instagram.com' in lkn or 'linkedin.com' in lkn or 'facebook.com' in lkn: continue
|
||||
else:
|
||||
try:
|
||||
### get title img description
|
||||
print('>>Getting Link Metadata<<')
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
|
||||
tit, desc, ima = web_preview(lkn,timeout=10,headers=headers,parser='lxml')
|
||||
title.append(tit)
|
||||
description.append(desc)
|
||||
image.append(ima)
|
||||
except:
|
||||
print("broken link")
|
||||
### If it's a quote-tweet, get original stats
|
||||
if hasattr(tweet, 'quoted_status'):
|
||||
print("Quoted Status NotSeen")
|
||||
qtmedia = ['']
|
||||
qturls = ['']
|
||||
qttext = tweet.quoted_status.full_text
|
||||
qtuser = tweet.quoted_status.user.screen_name
|
||||
qtid = "https://twitter.com/" + qtuser + "/status/" + tweet.quoted_status.id_str
|
||||
if 'media' in tweet.quoted_status.entities:
|
||||
for media in tweet.quoted_status.extended_entities['media']:
|
||||
qtmedia.append(media['media_url_https'])
|
||||
if 'urls' in tweet.quoted_status.entities:
|
||||
for url in tweet.quoted_status.entities['urls']:
|
||||
qturls.append(url['expanded_url'])
|
||||
#### Column attributes
|
||||
line = [created, "'"+idstr+"'", id, favorites, retweetcount, username, text, hashtags, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
|
||||
|
||||
#### Write row to CSV and print line
|
||||
csvWriter.writerow(line)
|
||||
print(line)
|
||||
csvFile.close()
|
||||
print("Complete")
|
Loading…
Reference in New Issue
Block a user