Skip to content
Snippets Groups Projects
Commit 4ae486d5 authored by mmzk1526's avatar mmzk1526
Browse files

Merge remote-tracking branch 'origin/main'

parents 6a3115b5 3f4b7e6a
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import nltk import nltk
from nltk.corpus import stopwords from nltk.corpus import stopwords
import pandas as pd import pandas as pd
import re import re
# One-off setup for the notebook: fetch the NLTK stop-word corpus, then read
# the training set (comma-separated, backslash as the escape character).
nltk.download('stopwords')
data = pd.read_csv('data/train.csv', sep=",", escapechar="\\")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Bar chart of how many samples carry each raw label value, bars ordered by
# label, saved to imgs/ and then displayed.
label_counts = data['label'].value_counts().sort_index()
ax = label_counts.plot(kind='bar', figsize=(4, 6))
ax.set_title('Distribution of Labels', fontsize=20)
ax.set_xlabel('Label', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)
plt.xticks(rotation='horizontal', fontsize=20)
plt.yticks(fontsize=15)

# Write each bar's height directly above its top edge, centred horizontally.
for bar in ax.patches:
    height = bar.get_height()
    x_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(str(height), (x_centre, height),
                ha='center', va='bottom', xytext=(0, 0), textcoords='offset points', fontsize=12)

plt.savefig("imgs/Distribution_of_Label.jpg", bbox_inches='tight', dpi=150)
plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Bar chart of the binary `is_patronising` flag (bars in value_counts order,
# i.e. most frequent first), saved to imgs/ and then displayed.
binary_counts = data['is_patronising'].value_counts()
ax = binary_counts.plot(kind='bar', figsize=(4, 6))
ax.set_title('Distribution of Binary Labels', fontsize=20)
ax.set_xlabel('Is Patronising', fontsize=20)
ax.set_ylabel('Frequency', fontsize=18)
plt.xticks(rotation='horizontal', fontsize=20)
plt.yticks(fontsize=15)

# Write each bar's height directly above its top edge, centred horizontally.
for bar in ax.patches:
    height = bar.get_height()
    x_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(str(height), (x_centre, height),
                ha='center', va='bottom', xytext=(0, 0), textcoords='offset points', fontsize=12)

plt.savefig("imgs/Distribution_of_Binary_Label.jpg", bbox_inches='tight', dpi=150)
plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Stacked bar chart of sample counts per keyword category, split by the
# binary patronising flag, with the False:True ratio printed above each bar.
keyword_patronising_ct = (
    pd.crosstab(data['keyword'], data['is_patronising'])
    # Guarantee that both class columns exist, in (False, True) order, so the
    # colours, the hard-coded legend labels and the lookups below always line
    # up and never raise KeyError when one class is absent.
    .reindex(columns=[False, True], fill_value=0)
)
ax = keyword_patronising_ct.plot(kind='bar', stacked=True, color=['orange', 'mediumblue'], figsize=(10, 6))

# Setting the title and labels
plt.title('Distribution of Data Category by Patronising Status', fontsize=20)
plt.xlabel('Category', fontsize=20)
plt.ylabel('Frequency', fontsize=20)
plt.xticks(rotation=45, fontsize=20)
plt.yticks(fontsize=20)
plt.legend(title='Is Patronising', labels=['False', 'True'], loc=(0.82, 0.03), fontsize=15)

# Calculating ratios and annotating bars
for i, category in enumerate(keyword_patronising_ct.index):
    false_count = keyword_patronising_ct.loc[category, False]
    true_count = keyword_patronising_ct.loc[category, True]
    # The ratio is undefined when a category has no patronising sample;
    # show that explicitly instead of a misleading "0.00".
    ratio_label = f'{false_count / true_count:.2f}' if true_count != 0 else 'n/a'
    # Annotating the bar with the ratio value, positioned at the top of the bar
    plt.text(i, false_count + true_count, ratio_label, ha='center', va='bottom', fontsize=15)

plt.savefig("imgs/Distribution_of_Keywords.jpg", bbox_inches='tight', dpi=150)
plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Stacked bar chart of sample counts per country code, split by the binary
# patronising flag, with the False:True ratio printed above each bar.
# NOTE(review): the table keeps the name `keyword_patronising_ct` (it holds
# country codes here) because it is module-level state other cells may read.
keyword_patronising_ct = (
    pd.crosstab(data['country_code'], data['is_patronising'])
    # Guarantee that both class columns exist, in (False, True) order, so the
    # colours, the hard-coded legend labels and the lookups below always line
    # up and never raise KeyError when one class is absent.
    .reindex(columns=[False, True], fill_value=0)
)
keyword_patronising_ct.plot(kind='bar', stacked=True, color=['orange', 'mediumblue'], figsize=(12, 6))

plt.title('Distribution of Country Codes by Patronising Status', fontsize=20)
plt.xlabel('Country Code', fontsize=20)
plt.ylabel('Frequency', fontsize=20)
plt.xticks(rotation=45, fontsize=20)
plt.yticks(fontsize=20)
plt.legend(title='Is Patronising', labels=['False', 'True'], loc=(0.83, 0.03), fontsize=15)

# Calculating ratios and annotating bars
for i, category in enumerate(keyword_patronising_ct.index):
    false_count = keyword_patronising_ct.loc[category, False]
    true_count = keyword_patronising_ct.loc[category, True]
    # The ratio is undefined when a country has no patronising sample;
    # show that explicitly instead of a misleading "0.00".
    ratio_label = f'{false_count / true_count:.2f}' if true_count != 0 else 'n/a'
    # Annotating the bar with the ratio value, positioned at the top of the bar
    plt.text(i, false_count + true_count, ratio_label, ha='center', va='bottom', fontsize=12)

plt.savefig("imgs/Distribution_of_Country_Code.jpg", bbox_inches='tight', dpi=150)
plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Histogram of per-sample word counts (whitespace-delimited tokens of
# `text`), drawn with a logarithmic y-axis. Also stores the counts in a new
# `word_count` column on `data`.
data['word_count'] = data['text'].apply(lambda x: len(x.split()))
plt.hist(data['word_count'], bins=20, color='blue', alpha=0.7)
plt.title('Distribution of Word Count', fontsize=15)
plt.xlabel('Word Count', fontsize=15)
# The axis is log-scaled but its ticks are still raw frequencies, so the
# label must not claim that the plotted values are log(frequency).
plt.ylabel('Frequency (log scale)', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.yscale('log')

# Disabled exploratory output (sorted counts and a fixed-width-20 histogram):
# sorted_word_counts = data.sort_values('word_count', ascending=False)
# # Print the sorted word counts
# print(sorted_word_counts[['word_count']].reset_index(drop=True))
# bins = np.arange(0, data['word_count'].max() + 20, 20)
# hist, bin_edges = np.histogram(data["word_count"], bins=bins)
# hist_dict = {bin_edges[i]: hist[i] for i in range(len(hist))}
# print(hist_dict)

plt.savefig("imgs/Distribution_of_Word_Count.jpg", bbox_inches='tight', dpi=150)
plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# English stop words as a set, used for membership tests when cleaning text.
stop = set(stopwords.words('english'))
def clean_text(text, stop_words=None):
    """Return the non-stop-word tokens of *text*, in order.

    Every character that is not an ASCII letter or whitespace is deleted
    (not replaced by a space, so "don't" becomes "dont"); the result is
    split on whitespace and tokens whose lower-cased form is a stop word are
    dropped. Tokens keep their original casing.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) yields an empty list instead
            of raising ``TypeError``.
        stop_words: Optional container of lower-case words to drop. Defaults
            to the module-level ``stop`` set.

    Returns:
        A list of the remaining tokens.
    """
    if not isinstance(text, str):
        return []
    if stop_words is None:
        stop_words = stop
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return [word for word in letters_only.split() if word.lower() not in stop_words]
# --- Disabled in this revision: plain top-20 word-frequency plots per class ---
# print("Top 20 Frequent words with PCL")
# print(data[data['is_patronising']]['text'].apply(clean_text).explode().value_counts().head(20))
# data[data['is_patronising']]['text'].apply(clean_text).explode().value_counts().head(20).plot(kind='bar', figsize=(8, 6))
# plt.title('Top 20 Frequent Words in Patronising Samples', fontsize=20)
# plt.ylabel('Word Frequency', fontsize=20)
# plt.xlabel('Word', fontsize=20)
# plt.xticks(rotation=50, fontsize=18)
# plt.yticks(fontsize=18)
# plt.gcf().set_size_inches(12, 8)
# plt.savefig("imgs/Frequency_PCL.jpg", bbox_inches='tight', dpi=150)
# print("Top 20 Frequent words with no PCL")
# print(data[~data['is_patronising']]['text'].apply(clean_text).explode().value_counts().head(20))
# data[~data['is_patronising']]['text'].apply(clean_text).explode().value_counts().head(20).plot(kind='bar', figsize=(8, 6))
# plt.title('Top 20 Frequent Words in Non-Patronising Samples', fontsize=20)
# plt.ylabel('Word Frequency', fontsize=20)
# plt.xlabel('Word', fontsize=20)
# plt.xticks(rotation=50, fontsize=18)
# plt.yticks(fontsize=18)
# plt.gcf().set_size_inches(12, 8)
# plt.savefig("imgs/Frequency_noPCL.jpg", bbox_inches='tight', dpi=150)

# Split data into two groups and flatten each into one cleaned word per row
patronising_data = data[data['is_patronising'] == True]['text'].apply(clean_text).explode()
non_patronising_data = data[data['is_patronising'] == False]['text'].apply(clean_text).explode()

# Calculate word frequencies for each group
patronising_word_counts = patronising_data.value_counts()
non_patronising_word_counts = non_patronising_data.value_counts()

# Compare word frequencies. The two series are aligned on the union of their
# words; a word missing from one group gets NaN there, which really means
# "seen 0 times", so store it as an integer 0.
comparison_df = pd.DataFrame({
    'Patronising': patronising_word_counts,
    'Non-Patronising': non_patronising_word_counts,
}).fillna(0).astype(int)

# Calculate the ratio and difference between frequencies.
# The ratio stays undefined (NaN) when the word never occurs in
# non-patronising text, so division by zero cannot push such words to the top
# as +inf; NaN rows sort to the bottom.
non_patronising_nonzero = comparison_df['Non-Patronising'].where(comparison_df['Non-Patronising'] > 0)
comparison_df['Ratio'] = comparison_df['Patronising'] / non_patronising_nonzero
comparison_df['Difference'] = comparison_df['Patronising'] - comparison_df['Non-Patronising']

# Sort by the most distinctive words for patronising texts
comparison_df = comparison_df.sort_values(by='Ratio', ascending=False)
print("Word Frequency Comparison (Top 20 words in patronising samples):")
# print(comparison_df.head(20))

# Plotting the top 20 words with the highest ratio in patronising texts
comparison_df.head(20)['Ratio'].plot(kind='bar', figsize=(10, 6))
plt.title('Top 20 Distinctive Words in Patronising Samples', fontsize=20)
plt.ylabel('Ratio (Patronising / Non-Patronising)', fontsize=20)
plt.xlabel('Word', fontsize=20)
plt.xticks(rotation=65, fontsize=17)
plt.yticks(fontsize=18)
plt.gcf().set_size_inches(12, 8)
# NOTE(review): "Distintive" is misspelled, but the name is left unchanged
# here because the output path is runtime behaviour — rename deliberately.
plt.savefig("imgs/Distintive_Words.jpg", bbox_inches='tight', dpi=150)
plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Association between each feature and the binary patronising flag:
# point-biserial correlation for the numeric word count, chi-square test of
# independence for the categorical columns.
from scipy.stats import pointbiserialr, chi2_contingency

# scipy documents the dichotomous variable as the first argument; the
# statistic itself is symmetric in its two inputs.
correlation, p_value = pointbiserialr(data['is_patronising'], data['word_count'])
print(f"Word Count: Correlation: {correlation}, P-value: {p_value}")

for column in ('country_code', 'keyword'):
    contingency_table = pd.crosstab(data[column], data['is_patronising'])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    # Single f-string: passing two arguments to print() added a second space
    # after the comma.
    print(f"{column}: Chi-Square Statistic: {chi2}, P-value: {p_value}")
``` ```
......
imgs/Distintive_Words.jpg

149 KiB | W: | H:

imgs/Distintive_Words.jpg

142 KiB | W: | H:

imgs/Distintive_Words.jpg
imgs/Distintive_Words.jpg
imgs/Distintive_Words.jpg
imgs/Distintive_Words.jpg
  • 2-up
  • Swipe
  • Onion skin
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment