InΒ [34]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings

warnings.filterwarnings('ignore')

# Ensure the figure output directory exists before any cell saves to it
os.makedirs('../assets', exist_ok=True)

# Make plots look nice
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Fixed: this literal had been split across two lines by the export,
# which is a syntax error.
print("✅ Libraries imported successfully!")
β Libraries imported successfully!
Function to Generate EmailsΒΆ
InΒ [35]:
def generate_sample_emails(n_samples=1000):
    """
    Generate a synthetic customer-support email dataset.

    Parameters
    ----------
    n_samples : int, default 1000
        Total number of emails to generate.

    Returns
    -------
    pandas.DataFrame
        Columns: ``email_id`` (1-based, assigned after shuffling),
        ``subject``, ``body``, ``priority`` (one of 'Urgent', 'High',
        'Medium', 'Low').

    Notes
    -----
    Priorities follow a fixed, realistic support distribution:
    10% Urgent, 25% High, 40% Medium, 25% Low. The Low bucket absorbs
    any integer-rounding remainder so the total is exactly n_samples.
    In a real project, you'd load actual data instead.
    """
    np.random.seed(42)  # For reproducibility

    # Patterns for URGENT emails
    urgent_subjects = [
        'URGENT: Account locked',
        'Cannot access my account!!!',
        'Payment failed - CRITICAL',
        'Security breach detected',
        'Lost all my data - HELP',
        'EMERGENCY: Service completely down',
        'Account hacked - need immediate help',
        'CRITICAL ERROR - Cannot login'
    ]
    urgent_bodies = [
        'I cannot access my account and I have an important meeting in 1 hour. This is extremely urgent! Please help ASAP.',
        'My payment failed and now my account is locked. I need this fixed immediately!!!',
        'I think my account has been hacked. All my data is gone. URGENT.',
        'The entire service is down for my team. We cannot work. This is critical!',
        'I lost access to all my files. Need help NOW!',
        'CRITICAL: Cannot login and I have a deadline in 30 minutes!',
    ]
    # Patterns for HIGH priority
    high_subjects = [
        'Feature not working properly',
        'Error message when exporting',
        'Bug in the dashboard',
        'Cannot upload files',
        'Integration stopped working',
        'Getting error code 500'
    ]
    high_bodies = [
        'I am getting an error when I try to export my data. Can you help me fix this?',
        'The reporting feature stopped working. I need this for my weekly report.',
        'Files are not uploading correctly. This is affecting my workflow.',
        'Getting an error code when I try to access the dashboard.',
        'The integration with Slack is broken. Please look into this.',
    ]
    # Patterns for MEDIUM priority
    medium_subjects = [
        'Question about billing',
        'How to use advanced features?',
        'Need clarification on pricing',
        'Setup assistance needed',
        'Best practices question',
        'How do I configure this?'
    ]
    medium_bodies = [
        'I have a question about how the billing works for multiple users.',
        'Can you explain how to set up the advanced filters? Thanks.',
        'I need some clarification on the different pricing tiers.',
        'What is the best way to organize my projects in the system?',
        'Could you help me understand how the permissions work?',
    ]
    # Patterns for LOW priority
    low_subjects = [
        'Feature suggestion',
        'Feedback on recent update',
        'Enhancement idea',
        'Would be nice to have...',
        'General feedback',
        'Suggestion for improvement'
    ]
    low_bodies = [
        'It would be great if you could add a dark mode option in the future.',
        'I have a suggestion for improving the user interface.',
        'Overall the product is good. Here is some feedback for consideration.',
        'Would be nice to have an export to PDF feature someday.',
        'Consider adding keyboard shortcuts. Just a suggestion!',
    ]

    # Real customer support distribution: 10% Urgent, 25% High, 40% Medium,
    # 25% Low. One data-driven loop replaces four copy-pasted loops; the
    # generation order (Urgent, High, Medium, Low) is preserved.
    specs = [
        ('Urgent', int(n_samples * 0.10), urgent_subjects, urgent_bodies),
        ('High', int(n_samples * 0.25), high_subjects, high_bodies),
        ('Medium', int(n_samples * 0.40), medium_subjects, medium_bodies),
    ]
    emails = []
    for priority, count, subjects, bodies in specs:
        for _ in range(count):
            emails.append({
                'email_id': len(emails) + 1,
                'subject': np.random.choice(subjects),
                'body': np.random.choice(bodies),
                'priority': priority
            })

    # LOW absorbs the rounding remainder so totals hit n_samples exactly
    n_low = n_samples - len(emails)
    for _ in range(n_low):
        emails.append({
            'email_id': len(emails) + 1,
            'subject': np.random.choice(low_subjects),
            'body': np.random.choice(low_bodies),
            'priority': 'Low'
        })

    # Shuffle so rows aren't grouped by priority, then re-number the IDs
    np.random.shuffle(emails)
    for i, email in enumerate(emails):
        email['email_id'] = i + 1

    return pd.DataFrame(emails)
# Generate the data
print("Generating sample emails...")
df = generate_sample_emails(1000)

# Save it. Fixed: the top cell only creates ../assets, so ../data/raw may not
# exist yet — create it before writing, or to_csv raises FileNotFoundError.
os.makedirs('../data/raw', exist_ok=True)
df.to_csv('../data/raw/emails.csv', index=False)

# Fixed: these literals had been split across lines by the export (syntax errors)
print(f"✅ Generated {len(df)} emails")
print("✅ Saved to data/raw/emails.csv")
print(f"\nDataset shape: {df.shape}")
Generating sample emails... β Generated 1000 emails β Saved to data/raw/emails.csv Dataset shape: (1000, 4)
InΒ [36]:
# Quick sanity check: display the first rows of the generated dataset.
print("First 5 emails:")
print("="*80)
df.head()
First 5 emails: ================================================================================
Out[36]:
| email_id | subject | body | priority | |
|---|---|---|---|---|
| 0 | 1 | Feature not working properly | I am getting an error when I try to export my ... | High |
| 1 | 2 | Best practices question | What is the best way to organize my projects i... | Medium |
| 2 | 3 | How do I configure this? | What is the best way to organize my projects i... | Medium |
| 3 | 4 | Feature not working properly | Files are not uploading correctly. This is aff... | High |
| 4 | 5 | Account hacked - need immediate help | The entire service is down for my team. We can... | Urgent |
Read some actual examplesΒΆ
InΒ [37]:
# Print one randomly sampled email per priority so the reader sees what the
# raw text looks like in each class.
# NOTE(review): .sample(1) is unseeded, so the examples shown change on every
# run — pass random_state= if reproducible examples are wanted.
print("Let's read some actual emails to understand what we're working with:\n")
for priority in ['Urgent', 'High', 'Medium', 'Low']:
    print("-"*80)
    print(f"π§ {priority.upper()} PRIORITY EMAIL EXAMPLE")
    # Get one random example
    sample = df[df['priority'] == priority].sample(1).iloc[0]
    print(f"\nπ Subject: {sample['subject']}")
    print(f"π Body:\n{sample['body']}\n")
Let's read some actual emails to understand what we're working with: -------------------------------------------------------------------------------- π§ URGENT PRIORITY EMAIL EXAMPLE π Subject: EMERGENCY: Service completely down π Body: CRITICAL: Cannot login and I have a deadline in 30 minutes! -------------------------------------------------------------------------------- π§ HIGH PRIORITY EMAIL EXAMPLE π Subject: Error message when exporting π Body: I am getting an error when I try to export my data. Can you help me fix this? -------------------------------------------------------------------------------- π§ MEDIUM PRIORITY EMAIL EXAMPLE π Subject: How do I configure this? π Body: I have a question about how the billing works for multiple users. -------------------------------------------------------------------------------- π§ LOW PRIORITY EMAIL EXAMPLE π Subject: Suggestion for improvement π Body: I have a suggestion for improving the user interface.
Basic InformationΒΆ
InΒ [38]:
# Structural overview of the dataset: size, schema, dtypes, and missingness.
divider = "=" * 50

print("Dataset Information:")
print("-" * 50)
print(f"Total emails: {len(df)}")
print(f"Number of columns: {len(df.columns)}")
print(f"Column names: {list(df.columns)}")

print("\n" + divider)
print("Data types:")
print(df.dtypes)

print("\n" + divider)
print("Missing values:")
print(df.isnull().sum())
Dataset Information: -------------------------------------------------- Total emails: 1000 Number of columns: 4 Column names: ['email_id', 'subject', 'body', 'priority'] ================================================== Data types: email_id int64 subject str body str priority str dtype: object ================================================== Missing values: email_id 0 subject 0 body 0 priority 0 dtype: int64
Ensuring Priority DistributionΒΆ
InΒ [39]:
# Class balance check — this drives the later class_weight decision.
print("Priority Distribution:")
print("-" * 50)

counts = df['priority'].value_counts()
print("\nCounts:")
print(counts)

# Same values as value_counts(normalize=True) * 100, derived from the
# counts already computed above.
print("\nPercentages:")
percentages = counts / counts.sum() * 100
for priority, pct in percentages.items():
    print(f"{priority:10s}: {pct:5.1f}%")
Priority Distribution: -------------------------------------------------- Counts: priority Medium 400 High 250 Low 250 Urgent 100 Name: count, dtype: int64 Percentages: Medium : 40.0% High : 25.0% Low : 25.0% Urgent : 10.0%
Visual of Priority DistributionΒΆ
InΒ [40]:
# Two views of the class balance: absolute counts (bar) and shares (pie).
# The figure is saved to ../assets for the project write-up.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Bar chart
# One fixed color per priority so the palette stays consistent across plots
colors = {'Urgent': '#e74c3c', 'High': '#e67e22', 'Medium': '#f39c12', 'Low': '#2ecc71'}
priority_counts = df['priority'].value_counts()
bars = axes[0].bar(priority_counts.index, priority_counts.values)
# Color each bar
for bar, priority in zip(bars, priority_counts.index):
    bar.set_color(colors[priority])
axes[0].set_title('Email Count by Priority', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Priority', fontsize=12)
axes[0].set_ylabel('Number of Emails', fontsize=12)
axes[0].grid(axis='y', alpha=0.3)
# Add count labels on bars (the +10 offset lifts each label just above its bar)
for i, (priority, count) in enumerate(priority_counts.items()):
    axes[0].text(i, count + 10, str(count), ha='center', fontweight='bold')
# Pie chart
pie_colors = [colors[p] for p in priority_counts.index]
axes[1].pie(priority_counts.values, labels=priority_counts.index, autopct='%1.1f%%',
            colors=pie_colors, startangle=90, textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[1].set_title('Priority Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
fig.savefig('../assets/priority_distribution.png', dpi=150, bbox_inches='tight')
plt.show()
print("\nβ οΈ OBSERVATION: Class Imbalance Detected!")
print(" Urgent emails are only ~10% of the data")
print(" This means we need to use 'class_weight=balanced' when training")
β οΈ OBSERVATION: Class Imbalance Detected! Urgent emails are only ~10% of the data This means we need to use 'class_weight=balanced' when training
Text AnalysisΒΆ
InΒ [41]:
# Calculate lengths
df['subject_length'] = df['subject'].str.len()
df['body_length'] = df['body'].str.len()
df['word_count'] = df['body'].str.split().str.len()
print("Text Length Statistics:")
print("-"*50)
print(df[['subject_length', 'body_length', 'word_count']].describe())
print("\n\nBy Priority:")
print("-"*50)
print(df.groupby('priority')[['subject_length', 'body_length', 'word_count']].mean())
Text Length Statistics:
--------------------------------------------------
subject_length body_length word_count
count 1000.000000 1000.000000 1000.00000
mean 23.898000 62.314000 11.33800
std 4.241885 9.871416 2.75415
min 16.000000 45.000000 7.00000
25% 22.000000 57.000000 9.00000
50% 24.000000 59.000000 11.00000
75% 27.000000 68.000000 12.00000
max 36.000000 113.000000 21.00000
By Priority:
--------------------------------------------------
subject_length body_length word_count
priority
High 24.0400 66.664 12.4280
Low 20.6920 59.280 10.4480
Medium 24.9025 58.965 10.5825
Urgent 27.5400 72.420 13.8600
Visualize Text LengthsΒΆ
InΒ [42]:
# Box plots of each length feature, split by priority, to visualize whether
# length distributions differ across classes.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Box plots for each metric
for idx, (column, title) in enumerate([
    ('subject_length', 'Subject Length'),
    ('body_length', 'Body Length'),
    ('word_count', 'Word Count')
]):
    # Create box plot
    df.boxplot(column=column, by='priority', ax=axes[idx])
    axes[idx].set_title(title + ' by Priority', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Priority', fontsize=10)
    # NOTE(review): the y-axis shows a length (characters/words), so the
    # label 'Count' is slightly misleading — consider renaming.
    axes[idx].set_ylabel('Count', fontsize=10)
    axes[idx].get_figure().suptitle('')  # Remove automatic title
plt.tight_layout()
fig.savefig('../assets/text_length_boxplots.png', dpi=150, bbox_inches='tight')
plt.show()
print("\nπ OBSERVATION:")
print(" Text length alone doesn't strongly distinguish priorities")
print(" We need to look at the CONTENT, not just the length!")
π OBSERVATION: Text length alone doesn't strongly distinguish priorities We need to look at the CONTENT, not just the length!
Count Urgency SignalsΒΆ
InΒ [43]:
def count_urgency_signals(text):
    """
    Count simple urgency indicators in a piece of text.

    Parameters
    ----------
    text : str
        Raw email text (subject or body).

    Returns
    -------
    dict
        'exclamations' : number of '!' characters,
        'caps_words'   : number of all-caps words of 2+ letters,
        'urgent_words' : number of urgency keywords (case-insensitive),
        'questions'    : number of '?' characters.
    """
    text_lower = text.lower()
    return {
        'exclamations': text.count('!'),
        # Caps check runs on the original text, before lowercasing
        'caps_words': len(re.findall(r'\b[A-Z]{2,}\b', text)),
        # Fixed: the original alternation listed 'asap' twice (redundant)
        'urgent_words': len(re.findall(
            r'\b(urgent|asap|critical|emergency|immediately|now|help)\b',
            text_lower
        )),
        'questions': text.count('?')
    }
# Score every email body and attach the signal counts as new columns.
print("Analyzing urgency signals in emails...")

# One dict of counts per body, expanded into one column per signal
signal_cols = df['body'].apply(count_urgency_signals).apply(pd.Series)
df_analysis = pd.concat([df, signal_cols], axis=1)

# Average each indicator within a priority level
urgency_by_priority = df_analysis.groupby('priority')[
    ['exclamations', 'caps_words', 'urgent_words', 'questions']
].mean()

banner = "=" * 70
print("\n" + banner)
print("AVERAGE URGENCY INDICATORS BY PRIORITY")
print(banner)
print(urgency_by_priority)
Analyzing urgency signals in emails...
======================================================================
AVERAGE URGENCY INDICATORS BY PRIORITY
======================================================================
exclamations caps_words urgent_words questions
priority
High 0.00 0.000 0.232 0.232
Low 0.22 0.184 0.000 0.000
Medium 0.00 0.000 0.185 0.565
Urgent 1.17 0.610 1.640 0.000
Visualize Urgency IndicatorΒΆ
InΒ [44]:
# Grouped bar chart: average urgency indicators per priority level.
ax = urgency_by_priority.plot(kind='bar', figsize=(12, 6), width=0.8)
ax.set_title('Urgency Indicators by Priority', fontsize=14, fontweight='bold')
ax.set_xlabel('Priority', fontsize=12)
ax.set_ylabel('Average Count', fontsize=12)
ax.legend(title='Indicator', fontsize=10)
plt.xticks(rotation=45)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
# BUG FIX: the original called fig.savefig(...), but `fig` still pointed at
# the figure created in the earlier boxplot cell — pandas .plot() makes a NEW
# figure here, so the wrong image was being written to urgency_indicators.png.
ax.get_figure().savefig('../assets/urgency_indicators.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nπ― KEY FINDING:")
print(" β Urgent emails have MORE exclamation marks")
print(" β Urgent emails contain MORE urgent keywords")
print(" β These will be IMPORTANT FEATURES for our model!")
π― KEY FINDING: β Urgent emails have MORE exclamation marks β Urgent emails contain MORE urgent keywords β These will be IMPORTANT FEATURES for our model!
Find the most common wordsΒΆ
InΒ [45]:
def get_common_words(texts, n=15):
    """
    Extract the most common meaningful words from a collection of texts.

    Parameters
    ----------
    texts : iterable of str
        Texts to pool together (e.g. all bodies of one priority class).
    n : int, default 15
        Number of top words to return.

    Returns
    -------
    list of (str, int)
        The n most frequent words and their counts, most frequent first
        (ties keep first-seen order).
    """
    # Combine all texts into one lowercase blob
    all_text = ' '.join(texts).lower()
    # Extract words of 3+ letters (letters only) — this floor already drops
    # all 1-2 letter tokens such as 'my', 'is', 'to'
    words = re.findall(r'\b[a-z]{3,}\b', all_text)
    # Remove common stop words. Fixed: the original set also listed 'my',
    # which the 3-letter regex above can never produce (dead entry).
    stop_words = {
        'the', 'and', 'for', 'this', 'that', 'with', 'from',
        'have', 'are', 'not', 'you', 'can', 'need'
    }
    words = [w for w in words if w not in stop_words]
    return Counter(words).most_common(n)
# For each priority bucket, show its 10 most frequent content words with a
# simple text histogram — vocabulary differences are the strongest signal.
rule = "=" * 70
print(rule)
print("MOST COMMON WORDS BY PRIORITY")
print(rule)
for priority in ['Urgent', 'High', 'Medium', 'Low']:
    print(f"\n{rule}")
    print(f"π {priority.upper()} Priority")
    print(rule)
    bodies = df[df['priority'] == priority]['body']
    for word, count in get_common_words(bodies, n=10):
        bar = 'β' * (count // 3)  # one block per ~3 occurrences
        print(f" {word:15s} {count:3d} {bar}")
====================================================================== MOST COMMON WORDS BY PRIORITY ====================================================================== ====================================================================== π URGENT Priority ====================================================================== cannot 55 ββββββββββββββββββ account 44 ββββββββββββββ critical 39 βββββββββββββ access 33 βββββββββββ help 33 βββββββββββ now 32 ββββββββββ all 30 ββββββββββ urgent 29 βββββββββ entire 24 ββββββββ service 24 ββββββββ ====================================================================== π HIGH Priority ====================================================================== getting 105 βββββββββββββββββββββββββββββββββββ error 105 βββββββββββββββββββββββββββββββββββ when 105 βββββββββββββββββββββββββββββββββββ try 105 βββββββββββββββββββββββββββββββββββ export 58 βββββββββββββββββββ data 58 βββββββββββββββββββ help 58 βββββββββββββββββββ fix 58 βββββββββββββββββββ files 49 ββββββββββββββββ uploading 49 ββββββββββββββββ ====================================================================== π MEDIUM Priority ====================================================================== how 249 βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ question 88 βββββββββββββββββββββββββββββ about 88 βββββββββββββββββββββββββββββ billing 88 βββββββββββββββββββββββββββββ works 88 βββββββββββββββββββββββββββββ multiple 88 βββββββββββββββββββββββββββββ users 88 βββββββββββββββββββββββββββββ explain 87 βββββββββββββββββββββββββββββ set 87 βββββββββββββββββββββββββββββ advanced 87 βββββββββββββββββββββββββββββ ====================================================================== π LOW Priority ====================================================================== suggestion 112 βββββββββββββββββββββββββββββββββββββ would 95 βββββββββββββββββββββββββββββββ improving 57 βββββββββββββββββββ user 57 βββββββββββββββββββ interface 57 
βββββββββββββββββββ consider 55 ββββββββββββββββββ adding 55 ββββββββββββββββββ keyboard 55 ββββββββββββββββββ shortcuts 55 ββββββββββββββββββ just 55 ββββββββββββββββββ
Summary of FindingsΒΆ
InΒ [46]:
# Final recap of everything the exploration established, printed as a report.
print("="*70)
print("π EXPLORATION SUMMARY")
print("="*70)
# Fixed: this literal had been split across two lines by the export (syntax error)
print("\n✅ DATA QUALITY")
print(" β’ No missing values")
print(f" β’ Total emails: {len(df)}")
print(" β’ All text fields populated")
print("\nβ οΈ CLASS IMBALANCE DETECTED")
for priority, pct in df['priority'].value_counts(normalize=True).items():
    symbol = "π΄" if priority == "Urgent" else "π‘" if priority == "High" else "π’"
    print(f" {symbol} {priority:8s}: {pct*100:5.1f}%")
print(" β Action: Use class_weight='balanced' in model")
print("\nπ― DISTINGUISHING FEATURES FOUND")
print(" 1. Exclamation marks (more in urgent)")
print(" 2. Urgent keywords (urgent, asap, critical, emergency)")
print(" 3. CAPS words (more in urgent)")
print(" 4. Vocabulary differences (clear patterns)")
print("\nπ OBSERVATIONS")
print(" β’ Text length NOT very predictive")
print(" β’ Content/keywords ARE very predictive")
print(" β’ Clear patterns exist between priority levels")
print("\n⨠NEXT STEPS")
# Fixed: this literal had also been split across two lines by the export
print(" 1. ✅ Data exploration - COMPLETE")
print(" 2. βοΈ Feature engineering (TF-IDF + custom features)")
print(" 3. βοΈ Model training (Logistic Regression)")
print(" 4. βοΈ Evaluation (focus on Urgent recall)")
print("\n" + "="*70)
====================================================================== π EXPLORATION SUMMARY ====================================================================== β DATA QUALITY β’ No missing values β’ Total emails: 1000 β’ All text fields populated β οΈ CLASS IMBALANCE DETECTED π’ Medium : 40.0% π‘ High : 25.0% π’ Low : 25.0% π΄ Urgent : 10.0% β Action: Use class_weight='balanced' in model π― DISTINGUISHING FEATURES FOUND 1. Exclamation marks (more in urgent) 2. Urgent keywords (urgent, asap, critical, emergency) 3. CAPS words (more in urgent) 4. Vocabulary differences (clear patterns) π OBSERVATIONS β’ Text length NOT very predictive β’ Content/keywords ARE very predictive β’ Clear patterns exist between priority levels β¨ NEXT STEPS 1. β Data exploration - COMPLETE 2. βοΈ Feature engineering (TF-IDF + custom features) 3. βοΈ Model training (Logistic Regression) 4. βοΈ Evaluation (focus on Urgent recall) ======================================================================
Save workΒΆ
InΒ [47]:
# Persist the engineered features for the next notebook in the pipeline.
# Fixed: ../data/processed is never created earlier in this notebook, so
# create it before writing or to_csv raises FileNotFoundError.
os.makedirs('../data/processed', exist_ok=True)
df_analysis.to_csv('../data/processed/emails_with_features.csv', index=False)
# Fixed: this literal had been split across two lines by the export (syntax error)
print("✅ Saved analyzed data to: data/processed/emails_with_features.csv")
print("\nπ Don't forget to save this notebook! (Ctrl+S or Cmd+S)")
β Saved analyzed data to: data/processed/emails_with_features.csv π Don't forget to save this notebook! (Ctrl+S or Cmd+S)