Mirror of https://github.com/Decentralized-ID/decentralized-id.github.io.git, synced 2025-06-07 06:02:47 -04:00
Commit 08ca6cb42e: test twitter collection
Parent: dc9270e1ee
3 changed files with 173 additions and 22 deletions
.github/workflows/twitter.yml (vendored, new file, +46)
@@ -0,0 +1,46 @@
+# This is a basic workflow to help you get started with Actions
+name: twitter collections
+
+# Controls when the action will run: on a weekly schedule (Saturdays at 18:00 UTC)
+# and on pushes that touch the scraper or this workflow
+on:
+  schedule:
+    - cron: '0 18 * * 6'
+  push:
+    paths:
+      - 'twscraper.py'
+      - '.github/workflows/twitter.yml'
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "build"
+  build:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks out the repository under $GITHUB_WORKSPACE so the job can access it
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.9.0' # exact version or SemVer range of the Python version to use
+      - uses: py-actions/py-dependency-install@v2
+        with:
+          path: requirements.txt
+      - shell: bash
+        env: # Twitter API credentials, passed in from repository secrets
+          CONSUMER_KEY: ${{ secrets.CONSUMER_KEY }}
+          CONSUMER_SECRET: ${{ secrets.CONSUMER_SECRET }}
+          ACCESS_KEY: ${{ secrets.ACCESS_KEY }}
+          ACCESS_SECRET: ${{ secrets.ACCESS_SECRET }}
+        run: python twscraper.py
+      - name: Deploy Changes
+        run: |
+          git remote add gh-token "https://github.com/decentralized-id/decentralized-id.github.io.git"
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git pull
+          git commit -a -m "add twitter collection"
+          git push gh-token master
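Two notes on the last steps: twscraper.py reads the four secrets from the environment, and `git commit -a` exits nonzero when there is nothing to commit, which would fail the Deploy Changes step on a week with no new tweets. A minimal preflight sketch for the credentials (hypothetical, not part of this commit):

import os
import sys

required = ['CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_KEY', 'ACCESS_SECRET']
missing = [name for name in required if not os.environ.get(name)]
if missing:
    sys.exit('Missing Twitter credentials: ' + ', '.join(missing))  # nonzero exit fails the job early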
requirements.txt (new file, +2)
@@ -0,0 +1,2 @@
+tweepy
+webpreview
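The `from requests_oauthlib import OAuth1Session` in twscraper.py resolves only because tweepy declares requests-oauthlib as its own dependency; listing it explicitly, and pinning versions, would make the weekly runs more reproducible. A sketch (version numbers illustrative):

tweepy==3.10.0
webpreview==1.6.0
requests-oauthlib==1.3.0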
twscraper.py (147 changed lines)
@@ -1,13 +1,11 @@
 import tweepy
 import csv
-import pandas as pd
+import os  # needed for the os.environ.get calls below

 #### Credentials
-CONSUMER_KEY = os.environ.get('CONSUMER_KEY')
-CONSUMER_SECRET = os.environ.get('CONSUMER_SECRET')
-ACCESS_KEY = os.environ.get('ACCESS_KEY')
-ACCESS_SECRET = os.environ.get('ACCESS_SECRET')
+consumer_key = os.environ.get('CONSUMER_KEY')
+consumer_secret = os.environ.get('CONSUMER_SECRET')
+access_token = os.environ.get('ACCESS_KEY')
+access_token_secret = os.environ.get('ACCESS_SECRET')

 #### Authorization
 auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
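The lines collapsed after this hunk presumably finish the handshake; for reference, the standard tweepy v3 sequence (a sketch, not the hidden code) is:

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)  # api.search and api.user_timeline are called later in the script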
@@ -24,15 +22,18 @@ from datetime import date, timedelta, datetime
 current_date = date.today()
 days_before = (date.today()-timedelta(days=7))
 now = datetime.now()
-date_time = current_date.strftime("%m%d%y")
+date_time = current_date.strftime("%Y-%m-%d")
+startDate = days_before.strftime("%Y-%m-%d, %H:%M:%S")

 #### Open CSV + Write Column Names
-fname = 'SSI-DID_' + date_time + '.csv'
+fname = '_data/twitter/search_' + date_time + '.csv'  # relative to the checkout; a leading slash would write outside the workspace
 csvFile = open(fname, 'w+')
 csvWriter = csv.writer(csvFile)
-csvWriter.writerow(["Time","ID", "Link", "Likes", "Shares", "User", "Text", "Hashtags", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteText", "QuoteImg", "QuoteUrl"])
+csvWriter.writerow(["Time", "Link", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteImg", "QuoteUrl"])
 lines_seen = []
 text_seen = []
+tweet_ids = []

 for keyword in keywords:
     # Search hashtags/keywords
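The `keywords` list and the search call itself are collapsed in this diff; since the loop body reads tweet.full_text, the search presumably requests extended tweets. A sketch of the usual shape, with an assumed keyword list:

keywords = ['decentralized identity', 'self sovereign identity', 'SSI']  # assumed; not shown in the diff
for keyword in keywords:
    for tweet in tweepy.Cursor(api.search, q=keyword, tweet_mode='extended').items(100):
        ...  # per-tweet handling as in the hunks below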
@@ -51,21 +52,16 @@ for keyword in keywords:
         seen = 'FALSE'

         ### Set basic tweet attributes
-        retweetcount = tweet.retweet_count
-        favorites = tweet.favorite_count
         username = tweet.user.screen_name
         id = "https://twitter.com/" + username + "/status/" + tweet.id_str
         idstr = tweet.id_str
         text = tweet.full_text
-        hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]
         created = str(tweet.created_at)

         #### Only add a line to the csv if it hasn't already been added
         if hasattr(tweet, 'quoted_status'):
             quotedid = 'https://twitter.com/' + tweet.quoted_status.user.screen_name + '/status/' + tweet.quoted_status_id_str
-            print("Quoted ID " + quotedid)
             if quotedid in lines_seen:
-                print("Quoted Status Seen")
                 seen = 'TRUE'
         for y in lines_seen:
             if id == y:
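An aside on the dedup bookkeeping here: the lists and 'TRUE'/'FALSE' strings work, but sets and booleans do the same job with O(1) membership tests. An equivalent sketch (not part of the commit):

lines_seen = set()
text_seen = set()
# ... inside the tweet loop:
seen = id in lines_seen or text in text_seen
if not seen:
    lines_seen.add(id)
    text_seen.add(text)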
@@ -73,7 +69,7 @@ for keyword in keywords:
         for q in text_seen:
             if text == q:
                 seen = 'TRUE'
-        if seen == 'TRUE' or username == "Docbasia": continue
+        if seen == 'TRUE': continue
         else:
             ### Keep track of seen lines / tweets
             lines_seen.append(id)
@@ -84,7 +80,7 @@ for keyword in keywords:
                 user = tweet.in_reply_to_user_id_str
                 replink = "https://twitter.com/" + user + "/" + reply
             except:
-                print("no reply url")
+                pass
             ### Check for images in tweet
             if 'media' in tweet.entities:
                 for media in tweet.extended_entities['media']:
@@ -97,7 +93,7 @@ for keyword in keywords:
             ### Look for metadata
             from webpreview import web_preview
             ### Unless the link is an image, pdf, twitter, or insta
-            if username == "Docbasia" or 'twitter.com' in lkn or '.png' in lkn or '.jpg' in lkn or '.pdf' in lkn or 'instagram.com' in lkn or 'linkedin.com' in lkn or 'facebook.com' in lkn: continue
+            if 'twitter.com' in lkn or '.png' in lkn or '.jpg' in lkn or '.pdf' in lkn or 'instagram.com' in lkn or 'linkedin.com' in lkn or 'facebook.com' in lkn: pass
             else:
                 try:
                     ### get the title, image, and description
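webpreview's web_preview() returns a (title, description, image) tuple, which matches the desc/ima appends in the next hunk; the collapsed try-block presumably calls it per link, roughly (a sketch; the timeout keyword keeps a slow page from hanging the Action):

ttl, desc, ima = web_preview(lkn, timeout=10)
title.append(ttl)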
@@ -108,10 +104,9 @@ for keyword in keywords:
                     description.append(desc)
                     image.append(ima)
                 except:
-                    print("broken link")
+                    pass
             ### If it's a quote-tweet, get original stats
             if hasattr(tweet, 'quoted_status'):
-                print("Quoted Status NotSeen")
                 qtmedia = ['']
                 qturls = ['']
                 qttext = tweet.quoted_status.full_text
@@ -124,10 +119,118 @@ for keyword in keywords:
                 for url in tweet.quoted_status.entities['urls']:
                     qturls.append(url['expanded_url'])
             #### Column attributes
-            line = [created, "'"+idstr+"'", id, favorites, retweetcount, username, text, hashtags, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
+            line = [created, id, lnks, title, description, image, medias, replink, qtid, qtmedia, qturls]

             #### Write row to CSV and record the id
             csvWriter.writerow(line)
-            print(line)
+            tweet_ids.append(idstr)
+            print(idstr)
+#### Get USER Tweets
+tweets = []
+ids = []
+tmpTweets = api.user_timeline('DecentralizeID')
+for tweet in tmpTweets:
+    created = tweet.created_at.strftime("%Y-%m-%d, %H:%M:%S")
+    if created < date_time and created > startDate:
+        tweets.append(tweet)
+
+while (tmpTweets[-1].created_at.strftime("%Y-%m-%d, %H:%M:%S") > startDate):
+    print("Last Tweet @", tmpTweets[-1].created_at, " - fetching some more")
+    tmpTweets = api.user_timeline('DecentralizeID', max_id=tmpTweets[-1].id)  # was `username`, a stale value left over from the search loop
+    for tweet in tmpTweets:
+        createdate = tweet.created_at.strftime("%Y-%m-%d, %H:%M:%S")
+        if createdate < date_time and createdate > startDate:  # was `startdate`, an undefined name
+            tweets.append(tweet)
+
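One caveat in the paging loop above: Twitter's max_id is inclusive, so each batch re-fetches the previous oldest tweet (harmless here, since ids are deduplicated below). tweepy's Cursor can handle the paging instead; a sketch:

for tweet in tweepy.Cursor(api.user_timeline, screen_name='DecentralizeID', tweet_mode='extended').items():
    if tweet.created_at.strftime("%Y-%m-%d, %H:%M:%S") <= startDate:
        break  # past the 7-day window
    tweets.append(tweet)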
+for tweet in tweets:
+    created = str(tweet.created_at)
+    username = tweet.user.screen_name  # set before the link is built (the original built the link first, with a stale username)
+    id = "https://twitter.com/" + username + "/status/" + tweet.id_str
+    idstr = str(tweet.id_str)
+    if hasattr(tweet, 'text'):
+        text = tweet.text
+    if hasattr(tweet, 'full_text'):
+        text = tweet.full_text
+    try:
+        username = tweet.retweeted_status.user.screen_name
+        id = "https://twitter.com/" + tweet.retweeted_status.user.screen_name + "/status/" + tweet.retweeted_status.id_str
+        idstr = tweet.retweeted_status.id_str
+    except:
+        pass
+    if id not in ids:
+        ids.append(id)
+        tweet_ids.append(idstr)
+        # note: lnks, title, description, image, medias, replink, qtid still hold values from the search loop above
+        line = [created, id, lnks, title, description, image, medias, replink, qtid, qtmedia, qturls]  # qttext dropped so the row matches the 11-column header
+        #### Write row to CSV
+        csvWriter.writerow(line)
 csvFile.close()
-print("Complete")
+print(tweet_ids)
+
+# Create Collection
+from requests_oauthlib import OAuth1Session
+import json
+## OAuth1Session rather than the tweepy handler above; collection creation didn't work with the tweepy auth
+twitter = OAuth1Session(consumer_key,
+                        client_secret=consumer_secret,
+                        resource_owner_key=access_token,
+                        resource_owner_secret=access_token_secret)
+
+# create
+url = 'https://api.twitter.com/1.1/collections/create.json'
+params_create = {
+    'name': 'Decentralized-ID Curated ' + date_time,
+    'description': 'Decentralized Identity Curated Tweets by @infominer33 via identosphere.net',
+    'timeline_order': 'tweet_chron'
+}
+r = twitter.post(url, data=params_create)
+print(r.json())
+print(r.json()['response'])
+# 'response': {'timeline_id': 'custom-1180945428222595074'}
+## Extract the numeric id from the response
+timeline_id = r.json()['response']['timeline_id']
+response = timeline_id.removeprefix('custom-')  # removeprefix requires Python 3.9+, which the workflow pins
+# the collection can be viewed at, e.g.: https://twitter.com/laurenfratamico/timelines/1180945428222595074
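If the request fails (bad credentials, rate limit), the body has no 'response' key and the extraction above raises KeyError; a defensive sketch:

body = r.json()
if 'response' not in body:
    raise SystemExit('collection create failed: ' + str(body))
timeline_id = body['response']['timeline_id']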
+
+# bulk add
+url = 'https://api.twitter.com/1.1/collections/entries/curate.json'
+# split into batches of 100 for the uploads (curate.json accepts at most 100 changes per request)
+n = 100
+batches = [tweet_ids[i:i + n] for i in range(0, len(tweet_ids), n)]
+print(len(batches))
+
+for batch in batches:
+    params_add = {
+        "id": timeline_id,
+        "changes": []
+    }
+    for tweet_id in batch:
+        sub_params_add = {
+            "tweet_id": str(tweet_id),
+            "op": "add"
+        }
+        params_add['changes'].append(sub_params_add)
+
+    r = twitter.post(url, data=json.dumps(params_add))
+    print(r.json())
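To sanity-check the uploads, the collection can be read back through the entries endpoint (a sketch using the same OAuth1Session):

check = twitter.get('https://api.twitter.com/1.1/collections/entries.json',
                    params={'id': timeline_id, 'count': 10})
print(check.json())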
+
+file_name = "_posts/twitter/" + str(date_time) + '-twitter.md'  # relative to the checkout; a leading slash would write outside the workspace
+f = open(file_name, "w+")
+
+str1 = "---\n"
+str2 = 'title: "Twitter Collection – ' + date_time + '"\n'
+str3 = 'description: "Collection of tweets on decentralized identity – ' + date_time + '"\n'
+str4 = "last_modified_at: " + date_time + '\n'
+str5 = "---\n"
+str6 = "\n\n"
+str7 = '<a class="twitter-timeline" href="https://twitter.com/DecentralizeID/timelines/' + response + '">Decentralized Identity - Curated ' + date_time + '</a> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
+
+L = [str1, str2, str3, str4, str5, str6, str7]
+f.writelines(L)
+f.close()
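For reference, a run dated 2021-05-08 (illustrative) would write a post like the following, where <numeric id> is the value extracted into `response` above:

---
title: "Twitter Collection – 2021-05-08"
description: "Collection of tweets on decentralized identity – 2021-05-08"
last_modified_at: 2021-05-08
---


<a class="twitter-timeline" href="https://twitter.com/DecentralizeID/timelines/<numeric id>">Decentralized Identity - Curated 2021-05-08</a> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>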