Source code for tweepyclean.tweepyclean
import pandas as pd
import altair as alt
import re # Needed for clean_tweets()
from nltk.sentiment.vader import (
SentimentIntensityAnalyzer,
) # Needed for clean_tweets()
import textstat # Needed for clean_tweets()
import emoji # Needed for clean_tweets()
import tweepy # Needed for the ItemIterator type check in raw_df()
def raw_df(tweepy_items):
    """
    Build a dataframe from the raw JSON payload of each tweepy status.

    Parameters
    ----------
    tweepy_items : tweepy.cursor.ItemIterator
        Iterator object generated using the tweepy package.

    Returns
    -------
    pandas.core.frame.DataFrame
        Dataframe built from the ``_json`` attribute of every item
        yielded by ``tweepy_items``.

    Raises
    ------
    TypeError
        If ``tweepy_items`` is not a tweepy.cursor.ItemIterator.

    Examples
    --------
    #>>> raw_df(tweets)
    """
    if isinstance(tweepy_items, tweepy.cursor.ItemIterator):
        # Consuming the iterator is what actually fetches the statuses.
        return pd.DataFrame([status._json for status in tweepy_items])
    raise TypeError(
        "tweepy_items should be of type tweepy.cursor.ItemIterator")
def _strip_tweet_text(tweet_text, emoji_pattern):
    """Return tweet_text without mentions, hashtags, links, emojis, newlines."""
    # The first two patterns also drop cashtags that touch a comma
    # ("$ABC," / ",$ABC"); this is kept from the original implementation.
    stripped = re.sub(r"\$\w+[,]|\@\w+|[,]\$\w+", "", tweet_text)  # '@'s
    stripped = re.sub(r"\$\w+[,]|\#\w+|[,]\$\w+", "", stripped)  # hashtags
    stripped = re.sub(r"http\S+", "", stripped)  # http(s) links
    stripped = stripped.replace("\n", " ")
    stripped = re.sub(emoji_pattern, r"", stripped)
    # Drop leading/trailing colons first, then surrounding whitespace.
    return stripped.strip(":").strip()


def clean_tweets(
    tweets_df,
    handle="",
    text_only=True,
    word_count=True,
    emojis=True,
    hashtags=True,
    sentiment=True,
    flesch_readability=True,
    proportion_of_avg_retweets=True,
    proportion_of_avg_favorites=True,
):
    """
    Return a copy of a raw tweets dataframe with extra derived columns.

    Parameters
    ----------
    tweets_df : pandas.core.frame.DataFrame
        Dataframe of tweets; must contain the columns ``full_text``,
        ``retweet_count`` and ``favorite_count``. It is not modified.
    handle : str or int, optional
        When not ``""``, a ``handle`` column holding ``str(handle)`` is
        added to every row (default is ``""``, which adds no column).
    text_only : bool, optional
        Add ``text_only``: the tweet text with mentions, hashtags, links,
        emojis and newlines removed (default is True).
    word_count : bool, optional
        Add ``word_count``: number of whitespace-separated words in the
        cleaned text (default is True).
    emojis : bool, optional
        Add ``emojis``: list of emojis found in the tweet text, or ``""``
        when there are none (default is True).
    hashtags : bool, optional
        Add ``hashtags``: list of hashtags found in the tweet text, or
        ``""`` when there are none (default is True).
    sentiment : bool, optional
        Add ``sentiment_polarity``: the ``compound`` score returned by
        nltk's SentimentIntensityAnalyzer for the cleaned text
        (default is True).
    flesch_readability : bool, optional
        Add ``flesch_readability_score``: textstat's flesch reading ease
        of the cleaned text (default is True).
    proportion_of_avg_retweets : bool, optional
        Add ``prptn_rts_vs_avg``: ``retweet_count`` divided by the mean
        ``retweet_count`` of the dataframe (default is True).
    proportion_of_avg_favorites : bool, optional
        Add ``proportion_favorites_vs_avg``: ``favorite_count`` divided by
        the mean ``favorite_count`` of the dataframe (default is True).

    Returns
    -------
    pandas.core.frame.DataFrame
        Copy of ``tweets_df`` (same index) with the requested columns.

    Raises
    ------
    TypeError
        If ``tweets_df`` is not a DataFrame or ``handle`` is not str/int.
    ValueError
        If ``retweet_count``, ``favorite_count`` or ``full_text`` is
        missing from ``tweets_df``.

    Examples
    --------
    #>>> clean_tweets(tweets_df)
    #>>> clean_tweets(tweets_df, flesch_readability = False)
    #>>> clean_tweets(tweets_df, emojis = False, hashtags = False)
    #>>> clean_tweets(tweets_df, sentiment = False)
    """
    if not isinstance(tweets_df, pd.DataFrame):
        raise TypeError("tweets_df should be of type pd.DataFrame")
    if not (isinstance(handle, str) or isinstance(handle, int)):
        raise TypeError("handle should be of type str or Int")
    if "retweet_count" not in tweets_df.columns:
        raise ValueError(
            "\ninput dataframe does not contain retweet_count column "
            "needed to\ngenerate proportion_of_avg_retweets\n"
        )
    if "favorite_count" not in tweets_df.columns:
        raise ValueError(
            "\ninput dataframe does not contain favorite_count column "
            "needed to\ngenerate proportion_of_avg_favorites\n"
        )
    if "full_text" not in tweets_df.columns:
        raise ValueError(
            "input dataframe does not contain full_text column needed "
            "to generate\ntext_only and sentiment\n"
        )

    tweets_df_new = tweets_df.copy()

    # Used to add user entered handle
    if handle != "":
        tweets_df_new["handle"] = str(handle)

    raw_texts = tweets_df_new["full_text"].tolist()

    def as_object_column(values):
        # Explicit object Series so list-valued cells are stored as-is and
        # stay aligned with the frame's own (possibly non-default) index.
        return pd.Series(values, index=tweets_df_new.index, dtype=object)

    # The cleaned text feeds four different columns; compute it only when
    # at least one of them was requested.
    needs_clean_text = (
        text_only or word_count or sentiment or flesch_readability
    )
    if needs_clean_text or emojis:
        # NOTE(review): emoji.get_emoji_regexp() is believed to have been
        # removed in emoji 2.0 - confirm the version this package pins.
        emoji_pattern = emoji.get_emoji_regexp()
    if needs_clean_text:
        clean_texts = [
            _strip_tweet_text(text, emoji_pattern) for text in raw_texts
        ]

    if text_only:
        tweets_df_new["text_only"] = clean_texts

    if word_count:
        tweets_df_new["word_count"] = [
            len(text.split()) for text in clean_texts
        ]

    if emojis:
        # An empty match list is stored as "" (kept from the original).
        tweets_df_new["emojis"] = as_object_column(
            [re.findall(emoji_pattern, text) or "" for text in raw_texts]
        )

    if hashtags:
        # A hashtag counts when it starts the text or follows whitespace.
        # The result goes to its own name so the ``hashtags`` flag is never
        # overwritten while rows are being processed.
        hashtag_lists = [
            re.findall(r"(?<!\S)#\w+", text) or "" for text in raw_texts
        ]
        tweets_df_new["hashtags"] = as_object_column(hashtag_lists)

    if sentiment:
        # Built lazily: the analyzer needs the VADER lexicon, which should
        # not be required when no sentiment column is requested.
        analyzer = SentimentIntensityAnalyzer()
        tweets_df_new["sentiment_polarity"] = [
            analyzer.polarity_scores(str(text))["compound"]
            for text in clean_texts
        ]

    if flesch_readability:
        tweets_df_new["flesch_readability_score"] = [
            textstat.flesch_reading_ease(text) for text in clean_texts
        ]

    if proportion_of_avg_retweets:
        retweets = tweets_df_new["retweet_count"]
        tweets_df_new["prptn_rts_vs_avg"] = retweets / retweets.mean()

    if proportion_of_avg_favorites:
        favorites = tweets_df_new["favorite_count"]
        tweets_df_new["proportion_favorites_vs_avg"] = (
            favorites / favorites.mean()
        )

    return tweets_df_new
def tweet_words(clean_dataframe, top_n=1):
    """
    Return the most common words and their counts from a set of tweets.

    The output is sorted descending by count, and in reverse alphabetical
    order for words with the same count.

    Parameters
    ----------
    clean_dataframe : pandas.DataFrame
        A processed dataframe containing a ``text_only`` column of tweet
        text.
    top_n : int
        Number of most common words to return; must be greater than 0.
        If it exceeds the number of distinct words, all words are returned.

    Returns
    -------
    pandas.DataFrame
        A dataframe with a ``words`` column of individual words and a
        ``count`` column with the number of occurrences of each word.

    Raises
    ------
    TypeError
        If ``clean_dataframe`` is not a DataFrame or ``top_n`` is not an
        int.
    ValueError
        If ``top_n`` is not greater than 0.

    Examples
    --------
    #>>> tweet_words(dataframe, 3)
    pd.DataFrame(data = {'words' : ['best', 'apple', 'news'],
    'count' : [102, 52, 24]})
    """
    # check input type of clean_dataframe
    if not isinstance(clean_dataframe, pd.DataFrame):
        raise TypeError("clean_dataframe should be of type pd.DataFrame")
    # check input of top_n
    if not isinstance(top_n, int):
        raise TypeError("top_n should be of type Int")
    # Negative values must be rejected too: a negative slice bound would
    # silently return "all but the last n" rows.
    if top_n <= 0:
        raise ValueError("top_n must be greater than 0")

    # One row per word, then count occurrences of each distinct word.
    word_counts = (
        clean_dataframe["text_only"].str.split().explode().value_counts()
    )

    # Build the frame explicitly instead of relying on how
    # value_counts().to_frame() names its column, which differs between
    # pandas versions.
    output = pd.DataFrame(
        {"words": word_counts.index, "count": word_counts.to_numpy()}
    )

    # Count descending, ties broken in reverse alphabetical order.
    output = output.sort_values(["count", "words"], ascending=False)
    return output.head(top_n).reset_index(drop=True)
def sentiment_total(
    tweets,
    drop_sentiment=False,
    lexicon_path="data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
):
    """
    Count the words of the given tweets associated with each sentiment.

    Tweets are split on whitespace into single words, which are matched
    against a word-level emotion lexicon. This is based on the
    crowd-sourced NRC Emotion Lexicon, which associates words with eight
    basic emotions (anger, fear, anticipation, trust, surprise, sadness,
    joy, and disgust) and two sentiments (negative and positive). For more
    information on NRC:
    http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
    Note that a word can be associated with none, one, or many emotions.

    Parameters
    ----------
    tweets : pandas.Series or single-column pandas.DataFrame
        Text of the tweets, one string per row.
    drop_sentiment : bool
        Drop emotion/sentiment rows if no words are associated with them.
        Default is False.
    lexicon_path : str, optional
        Path of the tab-separated lexicon file. The file is expected to
        have a header providing the columns ``word``, ``sentiment`` and
        ``count``. The default is a relative path, so it is resolved
        against the current working directory.

    Returns
    -------
    pandas.DataFrame
        One row per sentiment with the columns ``sentiment``,
        ``word_count`` (sum of the lexicon ``count`` over the matched
        words) and ``total_words`` (number of words in ``tweets``).

    Raises
    ------
    TypeError
        If ``tweets`` is neither a Series nor a single-column DataFrame.

    Examples
    --------
    #>>> sentiment_total(tweets, drop_sentiment = True)
    sentiment word_count total_words
    anger 1 4
    disgust 2 4
    fear 1 4
    negative 2 4
    sadness 1 4
    """
    if isinstance(tweets, pd.DataFrame):
        if tweets.shape[1] != 1:
            raise TypeError(
                "tweets should be a pd.Series or a single-column "
                "pd.DataFrame"
            )
        tweets = tweets.iloc[:, 0]
    elif not isinstance(tweets, pd.Series):
        raise TypeError(
            "tweets should be a pd.Series or a single-column pd.DataFrame"
        )

    # One row per word. Empty or missing tweets explode to NaN and must
    # not be counted as words.
    words_df = pd.DataFrame(
        {"word": tweets.str.split().explode().dropna()}
    )
    total_words = len(words_df)

    emotion_lexicon_df = pd.read_csv(lexicon_path, sep="\t")  # NRC dataset
    matched = pd.merge(
        words_df, emotion_lexicon_df, how="inner", on="word")

    # if user deviates from default parameter drop 0 count sentiments
    if drop_sentiment:
        matched = matched[matched["count"] == 1]

    # Sum only the numeric ``count`` column: summing the whole group would
    # also try to aggregate the string ``word`` column.
    sentiment_counts = matched.groupby(
        "sentiment", as_index=False)["count"].sum()
    sentiment_counts = sentiment_counts.rename(
        columns={"count": "word_count"}
    )
    sentiment_counts["total_words"] = total_words
    return sentiment_counts
def engagement_by_hour(tweets_df):
    """
    Create a line chart of the average engagement (likes + retweets)
    received by hour of day at which the tweet was posted.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Dataframe of tweets with the columns ``created_at`` (anything
        ``pandas.to_datetime`` can parse), ``retweet_count`` and
        ``favorite_count``. It is not modified.

    Returns
    -------
    An Altair chart object (line chart) with ``hour`` on the x axis and
    the mean ``total_engagement`` for that hour on the y axis.

    Raises
    ------
    TypeError
        If ``tweets_df`` is not a DataFrame.

    Examples
    --------
    #>>> engagement_by_hour(tweets_df)
    """
    # check input type of tweets_df
    if not isinstance(tweets_df, pd.DataFrame):
        raise TypeError("Input should be of type pd.DataFrame")

    # Wrangle data in a private frame so the caller's dataframe keeps its
    # original columns and its original ``created_at`` values.
    created_at = pd.to_datetime(tweets_df["created_at"])
    total_engagement = (
        tweets_df["retweet_count"] + tweets_df["favorite_count"]
    )
    hourly_df = pd.DataFrame(
        {
            "hour": created_at.dt.hour.to_numpy(),
            "total_engagement": total_engagement.to_numpy(),
        }
    )
    grouped_df = hourly_df.groupby(
        "hour")["total_engagement"].mean().reset_index()

    # Plot chart
    # NOTE(review): the title keeps the leading/trailing newline of the
    # original triple-quoted literal - confirm whether that is intended.
    chart = (
        alt.Chart(
            grouped_df,
            title="\nAverage engagement (likes + retweets) by hour\n",
        )
        .mark_line()
        .encode(alt.X("hour"), alt.Y("total_engagement"))
    )
    return chart