Source code for tweepyclean.tweepyclean

import pandas as pd
import altair as alt
import re  # Needed for clean_tweets()
from nltk.sentiment.vader import (
    SentimentIntensityAnalyzer,
)  # Needed for clean_tweets()
import textstat  # Needed for clean_tweets()
import emoji  # Needed for clean_tweets()
import tweepy  # Needed to check for


def raw_df(tweepy_items):
    """
    Build a dataframe from the statuses yielded by a tweepy item iterator.

    Parameters
    ----------
    tweepy_items : tweepy.cursor.ItemIterator
        Iterator of statuses produced with the tweepy package.

    Returns
    -------
    pandas.core.frame.DataFrame
        One row per status; the columns are the keys found in each
        status' ``_json`` payload.

    Raises
    ------
    TypeError
        If ``tweepy_items`` is not a ``tweepy.cursor.ItemIterator``.

    Examples
    --------
    #>>> raw_df(tweets)
    """
    # Guard clause: anything other than a tweepy ItemIterator is rejected.
    if isinstance(tweepy_items, tweepy.cursor.ItemIterator):
        # Each status carries its raw API payload in ``_json``.
        status_records = [status._json for status in tweepy_items]
        return pd.DataFrame(status_records)

    raise TypeError(
        "tweepy_items should be of type tweepy.cursor.ItemIterator")
# Patterns applied, in this order, to reduce a tweet to plain text.
_MENTION_PATTERN = re.compile(r"\$\w+[,]|\@\w+|[,]\$\w+")
_HASHTAG_STRIP_PATTERN = re.compile(r"\$\w+[,]|\#\w+|[,]\$\w+")
_LINK_PATTERN = re.compile(r"http\S+")
# A hashtag is only extracted when it starts the text or follows whitespace,
# so URL fragments ("http://x/#frag") and "a#b" are not picked up.
_HASHTAG_TOKEN_PATTERN = re.compile(r"(?<!\S)#\w+")


def _strip_tweet_text(tweet_text, emoji_pattern):
    """Return tweet_text without mentions, hashtags, links, newlines, emojis."""
    stripped = _MENTION_PATTERN.sub("", tweet_text)
    stripped = _HASHTAG_STRIP_PATTERN.sub("", stripped)
    stripped = _LINK_PATTERN.sub("", stripped)
    stripped = stripped.replace("\n", " ")
    stripped = emoji_pattern.sub("", stripped)
    # Drop leading/trailing colons first, then surrounding whitespace.
    return stripped.strip(":").strip()


def _object_column(values, index):
    """Wrap per-row values (possibly lists) in an object Series on index."""
    return pd.Series(values, index=index, dtype=object)


def clean_tweets(
    tweets_df,
    handle="",
    text_only=True,
    word_count=True,
    emojis=True,
    hashtags=True,
    sentiment=True,
    flesch_readability=True,
    proportion_of_avg_retweets=True,
    proportion_of_avg_favorites=True,
):
    """
    Adds new columns based on the data in the raw_df() dataframe output.

    The new columns are computed for every row regardless of the dataframe's
    index, and the input dataframe does not receive the new columns.

    Parameters
    ----------
    tweets_df : pandas.core.frame.DataFrame
        Dataframe generated by raw_df(). It must contain the columns
        ``full_text``, ``retweet_count`` and ``favorite_count``.
    handle : str or int, optional
        When not the empty string, a ``handle`` column holding ``str(handle)``
        is added (default is "" and adds no column).
    text_only : bool, optional
        Add a ``text_only`` column with the tweet text stripped of emojis,
        links, hashtags and mentions (default is True).
    word_count : bool, optional
        Add a ``word_count`` column with the integer number of
        whitespace-separated words in the stripped text (default is True).
    emojis : bool, optional
        Add an ``emojis`` column with the list of emojis found in the tweet
        text, or "" when there are none (default is True).
    hashtags : bool, optional
        Add a ``hashtags`` column with the list of hashtags found in the
        tweet text, or "" when there are none (default is True).
    sentiment : bool, optional
        Add a ``sentiment_polarity`` column with the nltk.sentiment.vader
        SentimentIntensityAnalyzer compound score of the stripped text
        (default is True).
    flesch_readability : bool, optional
        Add a ``flesch_readability_score`` column with the textstat flesch
        reading ease score of the stripped text (default is True).
    proportion_of_avg_retweets : bool, optional
        Add a ``prptn_rts_vs_avg`` column: each tweet's retweet count divided
        by the mean retweet count of the dataframe (default is True).
    proportion_of_avg_favorites : bool, optional
        Add a ``proportion_favorites_vs_avg`` column: each tweet's favorite
        count divided by the mean favorite count of the dataframe
        (default is True).

    Returns
    -------
    tweets_df_new : pandas.core.frame.DataFrame
        Shallow copy of ``tweets_df`` with the requested columns appended.

    Raises
    ------
    TypeError
        If ``tweets_df`` is not a DataFrame or ``handle`` is not str/int.
    ValueError
        If ``retweet_count``, ``favorite_count`` or ``full_text`` is missing.

    Examples
    --------
    #>>> clean_tweets(tweets_df)
    #>>> clean_tweets(tweets_df, flesch_readability = False)
    #>>> clean_tweets(tweets_df, emojis = False, hashtags = False)
    #>>> clean_tweets(tweets_df, sentiment = False)
    """
    if not isinstance(tweets_df, pd.DataFrame):
        raise TypeError("clean_dataframe should be of type pd.DataFrame")
    if not isinstance(handle, (str, int)):
        raise TypeError("handle should be of type str or Int")
    if "retweet_count" not in tweets_df.columns:
        raise ValueError(
            " input dataframe does not contain retweet_count column"
            " needed to generate proportion_of_avg_retweets "
        )
    if "favorite_count" not in tweets_df.columns:
        raise ValueError(
            " input dataframe does not contain favorite_count column"
            " needed to generate proportion_of_avg_favorites "
        )
    if "full_text" not in tweets_df.columns:
        raise ValueError(
            "input dataframe does not contain full_text column"
            " needed to generate text_only and sentiment "
        )

    # Only new columns are assigned below, so a shallow copy is enough to
    # keep the caller's dataframe free of them.
    tweets_df_new = tweets_df.copy(deep=False)

    if handle != "":
        tweets_df_new["handle"] = str(handle)

    # Every derived column is built as a whole and aligned on the frame's own
    # index, so non-default (filtered, shuffled, labelled) indexes work.
    row_index = tweets_df_new.index
    raw_texts = tweets_df_new["full_text"].tolist()

    needs_clean_text = (
        text_only or word_count or sentiment or flesch_readability
    )
    if needs_clean_text or emojis:
        # Built once instead of twice per tweet.
        # NOTE(review): get_emoji_regexp() is deprecated/removed in newer
        # releases of the emoji package — confirm the pinned version.
        emoji_pattern = emoji.get_emoji_regexp()
    if needs_clean_text:
        clean_texts = [
            _strip_tweet_text(text, emoji_pattern) for text in raw_texts
        ]

    if text_only:
        tweets_df_new["text_only"] = _object_column(clean_texts, row_index)

    if word_count:
        tweets_df_new["word_count"] = pd.Series(
            [len(text.split()) for text in clean_texts],
            index=row_index,
            dtype="int64",
        )

    if emojis:
        emoji_matches = [emoji_pattern.findall(text) for text in raw_texts]
        tweets_df_new["emojis"] = _object_column(
            [found if found else "" for found in emoji_matches], row_index
        )

    if hashtags:
        # Per-row results live in their own name; the ``hashtags`` flag must
        # never be rebound, otherwise one tweet without hashtags would switch
        # the extraction off for all following tweets.
        hashtag_matches = [
            _HASHTAG_TOKEN_PATTERN.findall(text) for text in raw_texts
        ]
        tweets_df_new["hashtags"] = _object_column(
            [found if found else "" for found in hashtag_matches], row_index
        )

    if sentiment:
        # Constructed only when needed so disabling sentiment also avoids
        # loading the analyzer.
        analyzer = SentimentIntensityAnalyzer()
        tweets_df_new["sentiment_polarity"] = pd.Series(
            [
                analyzer.polarity_scores(str(text))["compound"]
                for text in clean_texts
            ],
            index=row_index,
            dtype="float64",
        )

    if flesch_readability:
        tweets_df_new["flesch_readability_score"] = pd.Series(
            [textstat.flesch_reading_ease(text) for text in clean_texts],
            index=row_index,
            dtype="float64",
        )

    if proportion_of_avg_retweets:
        retweet_counts = tweets_df_new["retweet_count"]
        tweets_df_new["prptn_rts_vs_avg"] = (
            retweet_counts / retweet_counts.mean()
        )

    if proportion_of_avg_favorites:
        favorite_counts = tweets_df_new["favorite_count"]
        tweets_df_new["proportion_favorites_vs_avg"] = (
            favorite_counts / favorite_counts.mean()
        )

    return tweets_df_new
def tweet_words(clean_dataframe, top_n=1):
    """
    Returns the most common words and counts from a list of tweets.

    The output is sorted descending by the count of words and in reverse
    alphabetical order for any word ties.

    Parameters
    ----------
    clean_dataframe : pandas.DataFrame
        A processed dataframe containing a ``text_only`` column with the
        cleaned tweet text.
    top_n : int
        The number of most common words to return; must be greater than 0.
        When it exceeds the number of distinct words, all words are returned.

    Returns
    -------
    pandas.DataFrame
        A dataframe with a ``words`` column containing individual words and
        a ``count`` column with the count of each word.

    Raises
    ------
    TypeError
        If ``clean_dataframe`` is not a DataFrame or ``top_n`` is not an int.
    ValueError
        If ``top_n`` is zero or negative.

    Examples
    --------
    #>>> tweet_words(dataframe, 3)
    pd.DataFrame(data = {'words' : ['best', 'apple', 'news'],
                         'count' : [102, 52, 24]})
    """
    if not isinstance(clean_dataframe, pd.DataFrame):
        raise TypeError("clean_dataframe should be of type pd.DataFrame")
    if not isinstance(top_n, int):
        raise TypeError("top_n should be of type Int")
    # Negative values must be rejected too: iloc[0:-k] would silently drop
    # the k least common words instead of selecting the top ones.
    if top_n <= 0:
        raise ValueError("top_n must be greater than 0")

    # One row per whitespace-separated word across all tweets.
    words = clean_dataframe["text_only"].str.split().explode()
    counts = words.value_counts()

    # Build the frame explicitly rather than relying on the column name that
    # value_counts().to_frame() produces, which differs across pandas versions.
    output = pd.DataFrame(
        {"words": counts.index, "count": counts.to_numpy()}
    )

    # Sort by count, breaking ties in reverse alphabetical order.
    output = output.sort_values(["count", "words"], ascending=False)
    output = output.reset_index(drop=True)

    # Slicing past the end simply returns every row.
    return output.iloc[:top_n]
# Default location of the NRC word-level lexicon, relative to the current
# working directory.
_DEFAULT_NRC_LEXICON_PATH = "data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"


def sentiment_total(tweets, drop_sentiment=False, lexicon_path=None):
    """
    Counts the words in a series of tweets associated with each emotion
    and sentiment of the NRC Emotion Lexicon.

    This is based on the crowd-sourced NRC Emotion Lexicon, which associates
    words with eight basic emotions (anger, fear, anticipation, trust,
    surprise, sadness, joy, and disgust) and two sentiments (negative and
    positive). For more information on NRC:
    http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

    Note that words can be 0:n with emotions (either associated with none,
    1, or many).

    Parameters
    ----------
    tweets : pandas.Series
        Series of tweet text; each entry is split on whitespace into words.
    drop_sentiment : bool
        Drop emotion/sentiment rows if no words are associated with them.
        Default is False.
    lexicon_path : str, optional
        Path of the tab-separated lexicon file. The file is read with its
        first row as the header and must provide the columns ``word``,
        ``sentiment`` and ``count``. Defaults to
        "data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt", resolved against the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        One row per sentiment with the columns ``sentiment``, ``word_count``
        (number of tweet words associated with it) and ``total_words``
        (number of words across all tweets).

    Examples
    --------
    #>>> sentiment_total(tweets, drop_sentiment = True)
      sentiment  word_count  total_words
          anger           1            4
        disgust           2            4
           fear           1            4
       negative           2            4
        sadness           1            4
    """
    if lexicon_path is None:
        lexicon_path = _DEFAULT_NRC_LEXICON_PATH

    # Empty or missing tweets explode to NaN; drop those so they are not
    # counted as words.
    words = tweets.str.split().explode().dropna()
    words_df = pd.DataFrame({"word": words})
    total_words = len(words_df)

    emotion_lexicon_df = pd.read_csv(lexicon_path, sep="\t")  # NRC dataset

    # Keep only tweet words that appear in the lexicon.
    matched = pd.merge(words_df, emotion_lexicon_df, how="inner", on="word")

    # if user deviates from default parameter drop 0 count sentiments
    if drop_sentiment:
        matched = matched[matched["count"] == 1]

    # Sum only the association flag: summing every column would also
    # concatenate the ``word`` strings on pandas >= 2.0.
    summary = matched.groupby("sentiment", as_index=False)["count"].sum()
    summary = summary.rename(columns={"count": "word_count"})
    summary["total_words"] = total_words

    return summary
def engagement_by_hour(tweets_df):
    """
    Creates a line chart of average number of likes and retweets received
    based on hour of tweet posted.

    The input dataframe is only read; no columns are added to or changed
    on it.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        A dataframe of tweets containing the columns ``created_at``
        (anything ``pandas.to_datetime`` can parse), ``retweet_count`` and
        ``favorite_count``.

    Returns
    -------
    altair.Chart
        Line chart with the hour of posting on the x axis and the mean of
        retweets plus favorites on the y axis.

    Raises
    ------
    TypeError
        If ``tweets_df`` is not a DataFrame.

    Examples
    --------
    #>>> engagement_by_hour(tweets_df)
    """
    # check input type of tweets_df
    if not isinstance(tweets_df, pd.DataFrame):
        raise TypeError("Input should be of type pd.DataFrame")

    # Derive the plotting data in a separate frame so the caller's
    # dataframe is not mutated as a side effect of drawing a chart.
    posted_hour = pd.to_datetime(tweets_df["created_at"]).dt.hour
    total_engagement = (
        tweets_df["retweet_count"] + tweets_df["favorite_count"]
    )
    engagement_df = pd.DataFrame(
        {"hour": posted_hour, "total_engagement": total_engagement}
    )
    grouped_df = (
        engagement_df.groupby("hour")["total_engagement"]
        .mean()
        .reset_index()
    )

    # Plot chart
    chart = (
        alt.Chart(
            grouped_df,
            title=''' Average engagement (likes + retweets) by hour ''',
        )
        .mark_line()
        .encode(alt.X("hour"), alt.Y("total_engagement"))
    )

    return chart