test twitter collection

2025-07-24 23:15:33 -04:00 · 2021-01-22 22:15:32 -05:00 · 2021-01-22 22:15:32 -05:00 · 08ca6cb42e
commit 08ca6cb42e
parent dc9270e1ee
3 changed files with 173 additions and 22 deletions
--- a/.github/workflows/twitter.yml
+++ b/.github/workflows/twitter.yml
@ -0,0 +1,46 @@
+# Weekly collection of decentralized-identity tweets, committed back to the repository
+
+name: twitter collections
+
+# Runs every Saturday at 18:00 UTC, and on any push that changes the
+# scraper script or this workflow file
+on:
+  schedule:
+    - cron: '0 18 * * 6'
+  push:
+    paths:
+    - 'twscraper.py'
+    - '.github/workflows/twitter.yml'
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "build"
+  build:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.9.0' # Version range or exact version of a Python version to use, using SemVer's version range syntax
+      - uses: py-actions/py-dependency-install@v2
+        with:
+          path: requirements.txt
+      - shell: bash
+        env: # Twitter API credentials, exposed to twscraper.py as environment variables
+          CONSUMER_KEY: ${{ secrets.CONSUMER_KEY }}
+          CONSUMER_SECRET: ${{ secrets.CONSUMER_SECRET }}
+          ACCESS_KEY: ${{ secrets.ACCESS_KEY }}
+          ACCESS_SECRET: ${{ secrets.ACCESS_SECRET }}
+        run: python twscraper.py
+      - name: Deploy Changes
+        run: |
+          git remote add gh-token "https://github.com/decentralized-id/decentralized-id.github.io.git"
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git pull
+          git add -A && git commit -m "add twitter collection"  # the scraper creates new files, which `commit -a` would not stage
+          git push gh-token master
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+tweepy
+webpreview
--- a/twscraper.py
+++ b/twscraper.py
@ -1,13 +1,11 @@
 import tweepy
 import csv
-import pandas as pd
+import os  # os.environ supplies the API credentials read below

 #### Credentials
-CONSUMER_KEY = os.environ.get('CONSUMER_KEY')
-CONSUMER_SECRET = os.environ.get('CONSUMER_SECRET')
-ACCESS_KEY = os.environ.get('ACCESS_KEY')
-ACCESS_SECRET = os.environ.get('ACCESS_SECRET')
-
-
+consumer_key = os.environ.get('CONSUMER_KEY')
+consumer_secret = os.environ.get('CONSUMER_SECRET')
+access_token = os.environ.get('ACCESS_KEY')
+access_token_secret = os.environ.get('ACCESS_SECRET')
 #### Authorization
 auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
@ -24,15 +22,18 @@ from datetime import date, timedelta, datetime
 current_date = date.today()   
 days_before = (date.today()-timedelta(days=7))
 now = datetime.now()
-date_time = current_date.strftime("%m%d%y")
+date_time = current_date.strftime("%Y-%m-%d")
+startDate = days_before.strftime("%Y-%m-%d, %H:%M:%S")  # days_before is a date, so the time part renders as 00:00:00
+

 #### Open CSV + Write Column Names
-fname = 'SSI-DID_' + date_time + '.csv'
+fname = '_data/twitter/search_' + date_time + '.csv'  # relative path: resolves under the working directory, not the filesystem root
 csvFile = open(fname, 'w+')
 csvWriter = csv.writer(csvFile)
-csvWriter.writerow(["Time","ID", "Link", "Likes", "Shares", "User", "Text", "Hashtags", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteText", "QuoteImg", "QuoteUrl"])
+csvWriter.writerow(["Time","Link", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteImg", "QuoteUrl"])
 lines_seen = []
 text_seen = []
+tweet_ids = []  # id of every tweet written to the CSV; added to the collection afterwards

 for keyword in keywords:
    # Search hashtags\keywords
@ -51,21 +52,16 @@ for keyword in keywords:
        seen = 'FALSE'

        ### Set basic tweet attributes
-        retweetcount = tweet.retweet_count
-        favorites = tweet.favorite_count
        username = tweet.user.screen_name
        id = "https://twitter.com/" + username + "/status/" + tweet.id_str
        idstr = tweet.id_str
        text = tweet.full_text
-        hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]     
        created = str(tweet.created_at)

        #### Only add line to csv if it's not already been added
        if hasattr(tweet, 'quoted_status'):
            quotedid = 'https://twitter.com/' + tweet.quoted_status.user.screen_name + '/status/' + tweet.quoted_status_id_str
-            print("Quoted ID " + quotedid)
            if quotedid in lines_seen:
-                print("Quoted Status Seen")
                seen = 'TRUE'
        for y in lines_seen:
            if id == y:
@ -73,7 +69,7 @@ for keyword in keywords:
        for q in text_seen:
            if text == q:
                seen = 'TRUE'
-        if seen == 'TRUE' or username == "Docbasia": continue
+        if seen == 'TRUE': continue
        else:
            ### Keep track of seen lines \ tweets
            lines_seen.append(id)
@ -84,7 +80,7 @@ for keyword in keywords:
                user = tweet.in_reply_to_user_id_str
                replink = "https://twitter.com/" + user + "/" + reply
            except:
-                print("no reply url")
+                pass
            ### Check for images in tweet
            if 'media' in tweet.entities:
                for media in tweet.extended_entities['media']:
@ -97,7 +93,7 @@ for keyword in keywords:
                    ### Look for metadata
                    from webpreview import web_preview
                    ### Unless link is an image pdf twitter or insta
-                    if username == "Docbasia" or 'twitter.com' in lkn or '.png' in lkn or '.jpg' in lkn or '.pdf' in lkn or 'instagram.com' in lkn or 'linkedin.com' in lkn or 'facebook.com' in lkn: continue
+                    if 'twitter.com' in lkn or '.png' in lkn or '.jpg' in lkn or '.pdf' in lkn or 'instagram.com' in lkn or 'linkedin.com' in lkn or 'facebook.com' in lkn: pass
                    else:
                        try:
                        ### get title img description
@ -108,10 +104,9 @@ for keyword in keywords:
                            description.append(desc)
                            image.append(ima) 
                        except:
-                            print("broken link")
+                            pass
                ### If it's a quote-tweet, get original stats
            if hasattr(tweet, 'quoted_status'):
-                print("Quoted Status NotSeen")
                qtmedia = ['']
                qturls = ['']
                qttext = tweet.quoted_status.full_text
@ -124,10 +119,118 @@ for keyword in keywords:
                    for url in tweet.quoted_status.entities['urls']:
                        qturls.append(url['expanded_url'])
            #### Column attributes
-            line = [created, "'"+idstr+"'", id, favorites, retweetcount, username, text, hashtags, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
+            line = [created, id, lnks, title, description, image, medias, replink, qtid, qtmedia, qturls]

            #### Write row to CSV and print line
            csvWriter.writerow(line)
-            print(line)
+            tweet_ids.append(idstr)
+            print(idstr)
+
+#### Get USER Tweets
+# Collect the account's own tweets from the past week. date_time is date-only,
+# so anything created today sorts after it and is left out.
+timeline_user = 'DecentralizeID'
+# Same layout as startDate, so the formatted strings compare chronologically
+time_fmt = "%Y-%m-%d, %H:%M:%S"
+
+tweets = []
+ids = []
+tmpTweets = api.user_timeline(timeline_user)
+# Page backwards until a page comes back empty or reaches startDate
+while tmpTweets:
+    for tweet in tmpTweets:
+        created = tweet.created_at.strftime(time_fmt)
+        if created < date_time and created > startDate:
+            tweets.append(tweet)
+    if tmpTweets[-1].created_at.strftime(time_fmt) <= startDate:
+        break
+    print("Last Tweet @", tmpTweets[-1].created_at, " - fetching some more")
+    # max_id is inclusive: step below the oldest id so it is not fetched again
+    tmpTweets = api.user_timeline(timeline_user, max_id = tmpTweets[-1].id - 1)
+
+
+# One CSV row per unique tweet
+for tweet in tweets:
+    created = str(tweet.created_at)
+    # A retweet is recorded under the original tweet's author and id
+    source = getattr(tweet, 'retweeted_status', tweet)
+    username = source.user.screen_name
+    idstr = source.id_str
+    id = "https://twitter.com/" + username + "/status/" + idstr
+    if id not in ids:
+        ids.append(id)
+        tweet_ids.append(idstr)
+        # Only Time and Link are known here; leave the other nine header
+        # columns empty rather than reusing values left by the search loop.
+        line = [created, id] + [''] * 9
+        #### Write row to CSV
+        csvWriter.writerow(line)
 csvFile.close()
-print("Complete")
+print(tweet_ids)
+
+# Create Collection
+from requests_oauthlib import OAuth1Session
+import json
+## The collections endpoints are called through a plain OAuth1 session;
+## creating a collection did not work with the tweepy auth above.
+twitter = OAuth1Session(consumer_key,
+                        client_secret=consumer_secret,
+                        resource_owner_key=access_token,
+                        resource_owner_secret=access_token_secret)
+
+# create
+url = 'https://api.twitter.com/1.1/collections/create.json'
+params_create = {
+    'name': 'Decentralized-ID Curated ' + date_time,
+    'description': 'Decentralized Identity Curated Tweets by @infominer33 via identosphere.net',
+    'timeline_order': 'tweet_chron'
+    }
+r = twitter.post(url, data=params_create)
+created_collection = r.json()
+print(created_collection)
+# A rejected request carries no 'response' key; stop with the API's reply
+# rather than an opaque KeyError.
+if 'response' not in created_collection:
+    raise RuntimeError('collection was not created: ' + str(created_collection))
+print(created_collection['response'])
+# 'response': {'timeline_id': 'custom-1180945428222595074'}}
+timeline_id = created_collection['response']['timeline_id']
+## Extract ID from response: the part after 'custom-' goes in the timeline URL
+response = timeline_id.removeprefix('custom-')
+# the collection can be viewed at, eg: https://twitter.com/laurenfratamico/timelines/1180945428222595074
+
+# bulk add
+url = 'https://api.twitter.com/1.1/collections/entries/curate.json'
+# split into batches of 100 for the uploads
+n = 100
+batches = [tweet_ids[i:i + n] for i in range(0, len(tweet_ids), n)]
+print (len(batches))
+
+for batch in batches:
+    params_add = {
+        "id": timeline_id,
+        "changes": [
+            {"tweet_id": str(tweet_id), "op": "add"} for tweet_id in batch
+        ]
+    }
+    # The change list is sent as a JSON document in the request body
+    r = twitter.post(url, data=json.dumps(params_add))
+    print(r.json())
+
+# Write the post that embeds the collection. The path is relative so it
+# resolves under the working directory instead of the filesystem root.
+file_name = "_posts/twitter/" + str(date_time) + '-twitter.md'
+
+# Front matter first, then the embedded-timeline markup
+post_lines = [
+    "---\n",
+    'title: "Twitter Collection – ' + date_time + '"\n',
+    'description: "Collection of tweets on decentralized identity – ' + date_time + '"\n',
+    "last_modified_at: " + date_time + '\n',
+    "---\n",
+    "\n\n",
+    '<a class="twitter-timeline" href="https://twitter.com/DecentralizeID/timelines/' + response + '">Decentralized Identity - Curated ' + date_time + '</a> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
+]
+# The text contains an en dash, so pin the encoding instead of relying on the locale
+with open(file_name, "w+", encoding="utf-8") as f:
+    f.writelines(post_lines)