Skip to content
Snippets Groups Projects
Commit 4ae486d5 authored by mmzk1526
Browse files

Merge remote-tracking branch 'origin/main'

parents 6a3115b5 3f4b7e6a
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
import pandas as pd
import re
# Fetch the NLTK stop-word corpus so that `stopwords.words(...)` can be
# used when the texts are tokenised.
nltk.download('stopwords')

# Load the training split into a DataFrame; backslash is declared as the
# escape character for the CSV parser.
data = pd.read_csv(
    'data/train.csv',
    sep=",",
    escapechar="\\",
)
```
%% Cell type:code id: tags:
``` python
# Bar chart of how many samples carry each value of `label`. sort_index()
# orders the bars by label value instead of by frequency.
label_counts = data['label'].value_counts().sort_index()
ax = label_counts.plot(kind='bar', figsize=(4, 6))
plt.title('Distribution of Labels', fontsize=20)
plt.xlabel('Label', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.xticks(rotation='horizontal', fontsize=20)
plt.yticks(fontsize=15)

# Write each bar's height directly above it, centred on the bar.
for p in ax.patches:
    ax.annotate(
        str(p.get_height()),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='bottom',
        xytext=(0, 0), textcoords='offset points',
        fontsize=12,
    )

plt.savefig("imgs/Distribution_of_Label.jpg", bbox_inches='tight', dpi=150)
plt.show()
```
%% Cell type:code id: tags:
``` python
# Bar chart of the binary `is_patronising` flag. value_counts() is left in
# its default order, so the most frequent value is drawn first.
binary_counts = data['is_patronising'].value_counts()
ax = binary_counts.plot(kind='bar', figsize=(4, 6))
plt.title('Distribution of Binary Labels', fontsize=20)
plt.xlabel('Is Patronising', fontsize=20)
plt.ylabel('Frequency', fontsize=18)
plt.xticks(rotation='horizontal', fontsize=20)
plt.yticks(fontsize=15)

# Write each bar's height directly above it, centred on the bar.
for p in ax.patches:
    ax.annotate(
        str(p.get_height()),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='bottom',
        xytext=(0, 0), textcoords='offset points',
        fontsize=12,
    )

plt.savefig("imgs/Distribution_of_Binary_Label.jpg", bbox_inches='tight', dpi=150)
plt.show()
```
%% Cell type:code id: tags:
``` python
# Stacked bar chart: one bar per keyword, split into non-patronising
# (orange) and patronising (blue) sample counts.
keyword_patronising_ct = pd.crosstab(data['keyword'], data['is_patronising'])
ax = keyword_patronising_ct.plot(
    kind='bar',
    stacked=True,
    color=['orange', 'mediumblue'],
    figsize=(10, 6),
)

plt.title('Distribution of Data Category by Patronising Status', fontsize=20)
plt.xlabel('Category', fontsize=20)
plt.ylabel('Frequency', fontsize=20)
plt.xticks(rotation=45, fontsize=20)
plt.yticks(fontsize=20)
plt.legend(title='Is Patronising', labels=['False', 'True'], loc=(0.82, 0.03), fontsize=15)

# Annotate the top of every stacked bar with its
# non-patronising / patronising ratio.
for i, category in enumerate(keyword_patronising_ct.index):
    false_count = keyword_patronising_ct.loc[category, False]
    true_count = keyword_patronising_ct.loc[category, True]
    # The ratio is undefined when a category has no patronising samples;
    # such a bar is labelled 0.00 rather than dividing by zero.
    ratio = false_count / true_count if true_count != 0 else 0
    plt.text(i, false_count + true_count, f'{ratio:.2f}', ha='center', va='bottom', fontsize=15)

plt.savefig("imgs/Distribution_of_Keywords.jpg", bbox_inches='tight', dpi=150)
plt.show()
```
%% Cell type:code id: tags:
``` python
# Stacked bar chart: one bar per country code, split into non-patronising
# (orange) and patronising (blue) sample counts.
# NOTE(review): the table is stored under `keyword_patronising_ct` even
# though it is built from `country_code`; the name is kept so the notebook's
# global state is unchanged, but consider renaming it.
keyword_patronising_ct = pd.crosstab(data['country_code'], data['is_patronising'])
keyword_patronising_ct.plot(
    kind='bar',
    stacked=True,
    color=['orange', 'mediumblue'],
    figsize=(12, 6),
)

plt.title('Distribution of Country Codes by Patronising Status', fontsize=20)
plt.xlabel('Country Code', fontsize=20)
plt.ylabel('Frequency', fontsize=20)
plt.xticks(rotation=45, fontsize=20)
plt.yticks(fontsize=20)
plt.legend(title='Is Patronising', labels=['False', 'True'], loc=(0.83, 0.03), fontsize=15)

# Annotate the top of every stacked bar with its
# non-patronising / patronising ratio.
for i, category in enumerate(keyword_patronising_ct.index):
    false_count = keyword_patronising_ct.loc[category, False]
    true_count = keyword_patronising_ct.loc[category, True]
    # The ratio is undefined when a country has no patronising samples;
    # such a bar is labelled 0.00 rather than dividing by zero.
    ratio = false_count / true_count if true_count != 0 else 0
    plt.text(i, false_count + true_count, f'{ratio:.2f}', ha='center', va='bottom', fontsize=12)

plt.savefig("imgs/Distribution_of_Country_Code.jpg", bbox_inches='tight', dpi=150)
plt.show()
```
%% Cell type:code id: tags:
``` python
# Number of whitespace-separated tokens in each text, stored as a new
# `word_count` column on the frame.
data['word_count'] = data['text'].apply(lambda x: len(x.split()))

# Histogram of the word counts over 20 equal-width bins.
plt.hist(data['word_count'], bins=20, color='blue', alpha=0.7)
plt.title('Distribution of Word Count', fontsize=15)
plt.xlabel('Word Count', fontsize=15)
# The y axis is log-scaled below, but its ticks still show raw counts, so
# it is labelled as a frequency on a log scale rather than as
# log(frequency).
plt.ylabel('Frequency (log scale)', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.yscale('log')

plt.savefig("imgs/Distribution_of_Word_Count.jpg", bbox_inches='tight', dpi=150)
plt.show()
```
%% Cell type:code id: tags:
``` python
# English stop words, held in a set for O(1) membership tests.
stop = set(stopwords.words('english'))


def clean_text(text):
    """Return the non-stop-word tokens of *text* as a list.

    Every character that is not an ASCII letter or whitespace is removed
    first (so digits and punctuation vanish and e.g. an apostrophe joins
    the letters around it). The remainder is split on whitespace, and
    tokens whose lowercase form is in ``stop`` are dropped. Surviving
    tokens keep their original casing.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return [word for word in letters_only.split() if word.lower() not in stop]
def _plot_top_words(texts, title, out_path, top_n=20):
    """Bar-plot the most frequent cleaned words in *texts* and save the figure.

    Args:
        texts: pandas Series of raw text strings.
        title: Title drawn above the chart.
        out_path: File path the figure is written to.
        top_n: Number of most frequent words to show.
    """
    # clean_text() yields a word list per text; explode() flattens those
    # lists to one word per row so value_counts() can tally them.
    word_counts = texts.apply(clean_text).explode().value_counts()
    word_counts.head(top_n).plot(kind='bar', figsize=(12, 8))
    plt.title(title, fontsize=20)
    plt.ylabel('Word Frequency', fontsize=20)
    plt.xlabel('Word', fontsize=20)
    plt.xticks(rotation=50, fontsize=18)
    plt.yticks(fontsize=18)
    plt.savefig(out_path, bbox_inches='tight', dpi=150)


# One chart per class, each drawn on its own figure.
is_pcl = data['is_patronising']
_plot_top_words(
    data[is_pcl]['text'],
    'Top 20 Frequent Words in Patronising Samples',
    "imgs/Frequency_PCL.jpg",
)
_plot_top_words(
    data[~is_pcl]['text'],
    'Top 20 Frequent Words in Non-Patronising Samples',
    "imgs/Frequency_noPCL.jpg",
)
# Tokenise each class separately; explode() flattens the per-text word
# lists into one word per row.
is_pcl = data['is_patronising']
patronising_data = data[is_pcl]['text'].apply(clean_text).explode()
non_patronising_data = data[~is_pcl]['text'].apply(clean_text).explode()

# Per-class word frequencies.
patronising_word_counts = patronising_data.value_counts()
non_patronising_word_counts = non_patronising_data.value_counts()

# Align both count series on the word index. A word that occurs in only one
# class gets NaN in the other column, so its Ratio and Difference are NaN.
comparison_df = pd.DataFrame({
    'Patronising': patronising_word_counts,
    'Non-Patronising': non_patronising_word_counts,
})
comparison_df['Ratio'] = comparison_df['Patronising'] / comparison_df['Non-Patronising']
comparison_df['Difference'] = comparison_df['Patronising'] - comparison_df['Non-Patronising']

# Rank words by ratio, highest first. sort_values() places NaN last, so
# words seen in only one class never reach the top of the ranking.
# NOTE(review): the ratio uses raw counts and is not normalised by the
# number of samples in each class - confirm this is intended.
comparison_df = comparison_df.sort_values(by='Ratio', ascending=False)

# NOTE(review): only the header is printed; the statement that printed the
# table itself was disabled in the original cell.
print("Word Frequency Comparison (Top 20 words in patronising samples):")

# Bar chart of the 20 words with the highest patronising/non-patronising
# ratio.
comparison_df.head(20)['Ratio'].plot(kind='bar', figsize=(12, 8))
plt.title('Top 20 Distinctive Words in Patronising Samples', fontsize=20)
plt.ylabel('Ratio (Patronising / Non-Patronising)', fontsize=20)
plt.xlabel('Word', fontsize=20)
plt.xticks(rotation=65, fontsize=17)
plt.yticks(fontsize=18)
plt.savefig("imgs/Distintive_Words.jpg", bbox_inches='tight', dpi=150)
plt.show()
```
%% Cell type:code id: tags:
``` python
from scipy.stats import pointbiserialr, chi2_contingency

# Point-biserial correlation between the binary label and the word count.
# scipy documents the binary variable as the first argument; the
# coefficient is symmetric, so the reported value does not depend on the
# argument order.
correlation, p_value = pointbiserialr(data['is_patronising'], data['word_count'])
print(f"Word Count: Correlation: {correlation}, P-value: {p_value}")

# Chi-square test of independence between each categorical column and the
# binary label.
for column in ('country_code', 'keyword'):
    contingency_table = pd.crosstab(data[column], data['is_patronising'])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    print(f"{column}: Chi-Square Statistic: {chi2}, P-value: {p_value}")
```
......
imgs/Distintive_Words.jpg

149 KiB | W: | H:

imgs/Distintive_Words.jpg

142 KiB | W: | H:

imgs/Distintive_Words.jpg
imgs/Distintive_Words.jpg
imgs/Distintive_Words.jpg
imgs/Distintive_Words.jpg
  • 2-up
  • Swipe
  • Onion skin
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment