Source code for tweepyclean.tweepyclean

import pandas as pd
import altair as alt
import re  # Needed for clean_tweets()
from nltk.sentiment.vader import (
    SentimentIntensityAnalyzer,
)  # Needed for clean_tweets()
import textstat  # Needed for clean_tweets()
import emoji  # Needed for clean_tweets()
import tweepy  # Needed to check for


def raw_df(tweepy_items):
    """
    Build a dataframe from the statuses yielded by a tweepy item iterator.

    Every status contributes one row, taken from its ``_json`` attribute.

    Parameters
    ----------
    tweepy_items : tweepy.cursor.ItemIterator
        Iterator object generated using the tweepy package.

    Returns
    -------
    pandas.core.frame.DataFrame
        Dataframe constructed from the list of ``_json`` payloads, one row
        per status.

    Raises
    ------
    TypeError
        If ``tweepy_items`` is not a tweepy.cursor.ItemIterator.

    Examples
    --------
    #>>> raw_df(tweets)
    """
    # Happy path first: a valid iterator is drained straight into a frame.
    if isinstance(tweepy_items, tweepy.cursor.ItemIterator):
        return pd.DataFrame([status._json for status in tweepy_items])

    raise TypeError(
        "tweepy_items should be of type tweepy.cursor.ItemIterator")


def clean_tweets(
    tweets_df,
    handle="",
    text_only=True,
    word_count=True,
    emojis=True,
    hashtags=True,
    sentiment=True,
    flesch_readability=True,
    proportion_of_avg_retweets=True,
    proportion_of_avg_favorites=True,
):
    """
    Add derived columns to a dataframe of raw tweets.

    The input frame is not modified; the new columns are added to a copy.

    Parameters
    ----------
    tweets_df : pandas.core.frame.DataFrame
        Dataframe of tweets. It must contain the columns ``full_text``,
        ``retweet_count`` and ``favorite_count``.
    handle : str or int, optional
        When not the empty string, a ``handle`` column holding
        ``str(handle)`` is added (default is "" and adds no column).
    text_only : bool, optional
        Add a ``text_only`` column with the tweet text stripped of
        mentions, hashtags, links, newlines and emojis (default is True).
    word_count : bool, optional
        Add a ``word_count`` column with the number of whitespace separated
        words in the stripped text (default is True).
    emojis : bool, optional
        Add an ``emojis`` column with the list of emojis found in the tweet
        text, or "" when there are none (default is True).
    hashtags : bool, optional
        Add a ``hashtags`` column with the list of hashtags found in the
        tweet text, or "" when there are none (default is True).
    sentiment : bool, optional
        Add a ``sentiment_polarity`` column with the compound score of the
        nltk.sentiment.vader SentimentIntensityAnalyzer computed on the
        stripped text (default is True).
    flesch_readability : bool, optional
        Add a ``flesch_readability_score`` column with the textstat flesch
        reading ease score of the stripped text (default is True).
    proportion_of_avg_retweets : bool, optional
        Add a ``prptn_rts_vs_avg`` column: each tweet's retweet count
        divided by the mean retweet count of the frame (default is True).
    proportion_of_avg_favorites : bool, optional
        Add a ``proportion_favorites_vs_avg`` column: each tweet's favorite
        count divided by the mean favorite count of the frame
        (default is True).

    Returns
    -------
    pandas.core.frame.DataFrame
        Copy of ``tweets_df`` with the requested columns added.

    Raises
    ------
    TypeError
        If ``tweets_df`` is not a DataFrame or ``handle`` is neither a str
        nor an int.
    ValueError
        If ``retweet_count``, ``favorite_count`` or ``full_text`` is missing
        from ``tweets_df``.

    Examples
    --------
    #>>> clean_tweets(tweets_df)
    #>>> clean_tweets(tweets_df, flesch_readability = False)
    #>>> clean_tweets(tweets_df, emojis = False, hashtags = False)
    #>>> clean_tweets(tweets_df, sentiment = False)
    """
    if not isinstance(tweets_df, pd.DataFrame):
        raise TypeError("clean_dataframe should be of type pd.DataFrame")
    if not isinstance(handle, (str, int)):
        raise TypeError("handle should be of type str or Int")
    if "retweet_count" not in tweets_df.columns:
        raise ValueError(
            """
            input dataframe does not contain retweet_count column needed to
            generate proportion_of_avg_retweets
            """
        )
    if "favorite_count" not in tweets_df.columns:
        raise ValueError(
            """
            input dataframe does not contain favorite_count column needed to
            generate proportion_of_avg_favorites
            """
        )
    if "full_text" not in tweets_df.columns:
        raise ValueError(
            """input dataframe does not contain full_text column needed to generate
            text_only and sentiment
            """)

    # Loop-invariant patterns are built once instead of once per tweet.
    # NOTE(review): emoji.get_emoji_regexp() no longer exists in emoji >= 2.0;
    # confirm the emoji version this package is pinned to.
    emoji_pattern = emoji.get_emoji_regexp()
    # Also matches a hashtag at the very start of the tweet, which a pattern
    # requiring a leading whitespace character would miss.
    hashtag_pattern = re.compile(r"(?:^|\s)(#\w+)")

    # Shallow copy: only new columns are added, so the caller's frame keeps
    # its own set of columns.
    tweets_df_new = tweets_df.copy(deep=False)
    row_index = tweets_df_new.index

    # Used to add user entered handle
    if handle != "":
        tweets_df_new["handle"] = str(handle)

    # Columns are assembled as whole lists and assigned once. This works for
    # any index (not only 0..n-1) and avoids writing list values cell by cell
    # into columns that do not exist yet.
    raw_texts = tweets_df_new["full_text"].tolist()
    clean_texts = [
        _strip_tweet_markup(tweet_text, emoji_pattern)
        for tweet_text in raw_texts
    ]

    if text_only:
        tweets_df_new["text_only"] = clean_texts

    if word_count:
        tweets_df_new["word_count"] = [
            len(text.split()) for text in clean_texts
        ]

    if emojis:
        # "" (not an empty list) marks tweets without emojis.
        tweets_df_new["emojis"] = pd.Series(
            [re.findall(emoji_pattern, text) or "" for text in raw_texts],
            index=row_index,
            dtype=object,
        )

    if hashtags:
        # The flag itself is never rebound, so extraction stays enabled for
        # every row. "" marks tweets without hashtags.
        tweets_df_new["hashtags"] = pd.Series(
            [hashtag_pattern.findall(text) or "" for text in raw_texts],
            index=row_index,
            dtype=object,
        )

    if sentiment:
        analyzer = SentimentIntensityAnalyzer()
        tweets_df_new["sentiment_polarity"] = [
            analyzer.polarity_scores(str(text))["compound"]
            for text in clean_texts
        ]

    if flesch_readability:
        tweets_df_new["flesch_readability_score"] = [
            textstat.flesch_reading_ease(text) for text in clean_texts
        ]

    if proportion_of_avg_retweets:
        retweets = tweets_df_new["retweet_count"]
        tweets_df_new["prptn_rts_vs_avg"] = retweets / retweets.mean()

    if proportion_of_avg_favorites:
        favorites = tweets_df_new["favorite_count"]
        tweets_df_new["proportion_favorites_vs_avg"] = (
            favorites / favorites.mean()
        )

    return tweets_df_new


def _strip_tweet_markup(tweet_text, emoji_pattern):
    """Return tweet_text without mentions, hashtags, links, newlines, emojis."""
    stripped = re.sub(r"\$\w+[,]|\@\w+|[,]\$\w+", "", tweet_text)  # '@'s
    stripped = re.sub(r"\$\w+[,]|\#\w+|[,]\$\w+", "", stripped)  # hashtags
    stripped = re.sub(r"http\S+", "", stripped)  # links
    stripped = stripped.replace("\n", " ")
    stripped = re.sub(emoji_pattern, "", stripped)
    # Drop surrounding colons first, then surrounding whitespace.
    return stripped.strip(":").strip()


def tweet_words(clean_dataframe, top_n=1):
    """
    Returns the most common words and counts from a list of tweets.

    Words are taken from the ``text_only`` column, split on whitespace.
    The output is sorted descending by the count of words and in reverse
    alphabetical order for any word ties.

    Parameters
    ----------
    clean_dataframe : pandas.DataFrame
            A processed dataframe containing a user's tweet history and
            associated information; it must have a ``text_only`` column
    top_n : int
            The number of most common words to return; must be at least 1.
            When it exceeds the number of distinct words, all words are
            returned.

    Returns
    -------
    pandas.DataFrame
            A dataframe with a ``words`` column containing individual
            words and a ``count`` column with the count of each word

    Raises
    ------
    TypeError
            If ``clean_dataframe`` is not a DataFrame or ``top_n`` is not
            an int.
    ValueError
            If ``top_n`` is smaller than 1.

    Examples
    --------
    #>>> tweet_words(dataframe, 3)
    pd.DataFrame(data = {'words' : ['best', 'apple', 'news'],
    'count' : [102, 52, 24]})
    """
    # check input type of clean_dataframe
    if not isinstance(clean_dataframe, pd.DataFrame):
        raise TypeError("clean_dataframe should be of type pd.DataFrame")

    # check input of top_n
    if not isinstance(top_n, int):
        raise TypeError("top_n should be of type Int")

    # Negative values must be rejected too: slicing with a negative bound
    # would silently drop rows from the end instead of selecting the top n.
    if top_n <= 0:
        raise ValueError("top_n must be greater than 0")

    words = clean_dataframe["text_only"].str.split().explode()

    # Name the index and the counts explicitly so the result does not depend
    # on how a given pandas version labels the value_counts() output.
    output = (
        words.value_counts()
        .rename_axis("words")
        .reset_index(name="count")
        .sort_values(["count", "words"], ascending=False)
        .reset_index(drop=True)
    )

    return output.head(top_n)


def sentiment_total(tweets, drop_sentiment=False):
    """
    Takes an input of english text and outputs the number of words
    associated with eight emotions and positive/negative sentiment. This is
    based on the the crowd-sourced NRC Emotion Lexicon, which associates
    words with eight basic emotions (anger, fear, anticipation, trust,
    surprise, sadness, joy, and disgust) and two sentiments (negative and
    positive). For more information on NRC:
    http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

    Note that words can be 0:n with emotions (either associated with none,
    1, or many).

    Parameters
    ----------
    tweets : pandas.Series, pandas.DataFrame or list-like of str
        The text to analyse. Each entry is split on whitespace into words.
        For a dataframe only the first column is used.
    drop_sentiment : boolean
        Drop emotion/sentiment rows if no words are
        associated with them. Default is False.

    Returns
    -------
    pandas.DataFrame
        One row per sentiment with the columns ``sentiment``,
        ``word_count`` (summed lexicon counts of the matched words) and
        ``total_words`` (number of words in the input).

    Examples
    --------
    #>>> sentiment_total(df, drop_sentiment = True)

    sentiment      word_count  total_words
    anger          1            4
    disgust        2            4
    fear           1            4
    negative       2            4
    sadness        1            4
    """
    # Normalise the accepted inputs to a Series so the .str accessor works.
    if isinstance(tweets, pd.DataFrame):
        tweets = tweets.iloc[:, 0]
    elif not isinstance(tweets, pd.Series):
        tweets = pd.Series(tweets, dtype=object)

    words = pd.DataFrame({"word": tweets.str.split().explode()})
    total_words = len(words)

    # NOTE(review): the path is relative to the current working directory;
    # presumably the file has the columns word / sentiment / count - confirm.
    emotion_lexicon_df = pd.read_csv(
        "data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt", sep="\t"
    )  # NRC dataset

    matched = pd.merge(words, emotion_lexicon_df, how="inner", on="word")

    # if user deviates from default parameter drop 0 count sentiments
    if drop_sentiment:
        matched = matched[matched["count"] == 1]

    # Sum only the numeric "count" column; summing the whole frame would
    # also aggregate the string "word" column on recent pandas versions.
    summary = matched.groupby("sentiment", as_index=False)["count"].sum()
    summary = summary.rename(columns={"count": "word_count"})
    summary["total_words"] = total_words
    return summary


def engagement_by_hour(tweets_df):
    """
    Creates a line chart of average number of
    likes and retweets received based on hour
    of tweet posted.

    The input dataframe is left untouched; all intermediate columns are
    computed on a copy.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        A processed dataframe containing a user's tweet history
        and associated information. The columns ``created_at``,
        ``retweet_count`` and ``favorite_count`` are read.

    Returns
    -------
    An Altair graph object (line chart) of average engagement
    received by hour of tweet posted

    Raises
    ------
    TypeError
        If ``tweets_df`` is not a pandas DataFrame.

    Examples
    --------
    #>>> engagement_by_hour(tweets_df)
    """

    # check input type of tweets_df
    if not isinstance(tweets_df, pd.DataFrame):
        raise TypeError("Input should be of type pd.DataFrame")

    # Wrangle data on a copy of the needed columns so the caller's frame
    # does not gain extra columns or have created_at converted in place.
    engagement = tweets_df[
        ["created_at", "retweet_count", "favorite_count"]
    ].copy()
    engagement["created_at"] = pd.to_datetime(engagement["created_at"])
    engagement["hour"] = engagement["created_at"].dt.hour
    engagement["total_engagement"] = (
        engagement["retweet_count"] + engagement["favorite_count"]
    )
    grouped_df = engagement.groupby(
        "hour")["total_engagement"].mean().reset_index()

    # Plot chart
    chart = (
        alt.Chart(
            grouped_df,
            title='''
            Average engagement (likes + retweets) by hour
            ''')
        .mark_line()
        .encode(alt.X("hour"), alt.Y("total_engagement"))
    )

    return chart