InΒ [34]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings

warnings.filterwarnings('ignore')

# Ensure the figure output directory exists before any cell saves to it
os.makedirs('../assets', exist_ok=True)

# Make plots look nice
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Fixed: this literal had been split across two lines by the export,
# which is a syntax error.
print("✅ Libraries imported successfully!")
β Libraries imported successfully!
Function to Generate EmailsΒΆ
InΒ [35]:
def generate_sample_emails(n_samples=1000):
    """
    Generate a synthetic customer-support email dataset.

    Parameters
    ----------
    n_samples : int, default 1000
        Total number of emails to generate.

    Returns
    -------
    pandas.DataFrame
        Columns: ``email_id`` (1-based, assigned after shuffling),
        ``subject``, ``body``, ``priority`` (one of 'Urgent', 'High',
        'Medium', 'Low').

    Notes
    -----
    Priorities follow a fixed, realistic support distribution:
    10% Urgent, 25% High, 40% Medium, 25% Low. The Low bucket absorbs
    any integer-rounding remainder so the total is exactly n_samples.
    In a real project, you'd load actual data instead.
    """
    np.random.seed(42)  # For reproducibility

    # Patterns for URGENT emails
    urgent_subjects = [
        'URGENT: Account locked',
        'Cannot access my account!!!',
        'Payment failed - CRITICAL',
        'Security breach detected',
        'Lost all my data - HELP',
        'EMERGENCY: Service completely down',
        'Account hacked - need immediate help',
        'CRITICAL ERROR - Cannot login'
    ]
    urgent_bodies = [
        'I cannot access my account and I have an important meeting in 1 hour. This is extremely urgent! Please help ASAP.',
        'My payment failed and now my account is locked. I need this fixed immediately!!!',
        'I think my account has been hacked. All my data is gone. URGENT.',
        'The entire service is down for my team. We cannot work. This is critical!',
        'I lost access to all my files. Need help NOW!',
        'CRITICAL: Cannot login and I have a deadline in 30 minutes!',
    ]
    # Patterns for HIGH priority
    high_subjects = [
        'Feature not working properly',
        'Error message when exporting',
        'Bug in the dashboard',
        'Cannot upload files',
        'Integration stopped working',
        'Getting error code 500'
    ]
    high_bodies = [
        'I am getting an error when I try to export my data. Can you help me fix this?',
        'The reporting feature stopped working. I need this for my weekly report.',
        'Files are not uploading correctly. This is affecting my workflow.',
        'Getting an error code when I try to access the dashboard.',
        'The integration with Slack is broken. Please look into this.',
    ]
    # Patterns for MEDIUM priority
    medium_subjects = [
        'Question about billing',
        'How to use advanced features?',
        'Need clarification on pricing',
        'Setup assistance needed',
        'Best practices question',
        'How do I configure this?'
    ]
    medium_bodies = [
        'I have a question about how the billing works for multiple users.',
        'Can you explain how to set up the advanced filters? Thanks.',
        'I need some clarification on the different pricing tiers.',
        'What is the best way to organize my projects in the system?',
        'Could you help me understand how the permissions work?',
    ]
    # Patterns for LOW priority
    low_subjects = [
        'Feature suggestion',
        'Feedback on recent update',
        'Enhancement idea',
        'Would be nice to have...',
        'General feedback',
        'Suggestion for improvement'
    ]
    low_bodies = [
        'It would be great if you could add a dark mode option in the future.',
        'I have a suggestion for improving the user interface.',
        'Overall the product is good. Here is some feedback for consideration.',
        'Would be nice to have an export to PDF feature someday.',
        'Consider adding keyboard shortcuts. Just a suggestion!',
    ]

    # Real customer support distribution: 10% Urgent, 25% High, 40% Medium,
    # 25% Low. One data-driven loop replaces four copy-pasted loops; the
    # generation order (Urgent, High, Medium, Low) is preserved.
    specs = [
        ('Urgent', int(n_samples * 0.10), urgent_subjects, urgent_bodies),
        ('High', int(n_samples * 0.25), high_subjects, high_bodies),
        ('Medium', int(n_samples * 0.40), medium_subjects, medium_bodies),
    ]
    emails = []
    for priority, count, subjects, bodies in specs:
        for _ in range(count):
            emails.append({
                'email_id': len(emails) + 1,
                'subject': np.random.choice(subjects),
                'body': np.random.choice(bodies),
                'priority': priority
            })

    # LOW absorbs the rounding remainder so totals hit n_samples exactly
    n_low = n_samples - len(emails)
    for _ in range(n_low):
        emails.append({
            'email_id': len(emails) + 1,
            'subject': np.random.choice(low_subjects),
            'body': np.random.choice(low_bodies),
            'priority': 'Low'
        })

    # Shuffle so rows aren't grouped by priority, then re-number the IDs
    np.random.shuffle(emails)
    for i, email in enumerate(emails):
        email['email_id'] = i + 1

    return pd.DataFrame(emails)
# Generate the data
print("Generating sample emails...")
df = generate_sample_emails(1000)

# Save it. Fixed: the top cell only creates ../assets, so ../data/raw may not
# exist yet — create it before writing, or to_csv raises FileNotFoundError.
os.makedirs('../data/raw', exist_ok=True)
df.to_csv('../data/raw/emails.csv', index=False)

# Fixed: these literals had been split across lines by the export (syntax errors)
print(f"✅ Generated {len(df)} emails")
print("✅ Saved to data/raw/emails.csv")
print(f"\nDataset shape: {df.shape}")
Generating sample emails... β Generated 1000 emails β Saved to data/raw/emails.csv Dataset shape: (1000, 4)
InΒ [36]:
# Quick sanity check: display the first rows of the generated dataset.
print("First 5 emails:")
print("="*80)
df.head()
First 5 emails: ================================================================================
Out[36]:
| email_id | subject | body | priority | |
|---|---|---|---|---|
| 0 | 1 | Feature not working properly | I am getting an error when I try to export my ... | High |
| 1 | 2 | Best practices question | What is the best way to organize my projects i... | Medium |
| 2 | 3 | How do I configure this? | What is the best way to organize my projects i... | Medium |
| 3 | 4 | Feature not working properly | Files are not uploading correctly. This is aff... | High |
| 4 | 5 | Account hacked - need immediate help | The entire service is down for my team. We can... | Urgent |
Read some actual examplesΒΆ
InΒ [37]:
# Print one randomly sampled email per priority so the reader sees what the
# raw text looks like in each class.
# NOTE(review): .sample(1) is unseeded, so the examples shown change on every
# run — pass random_state= if reproducible examples are wanted.
print("Let's read some actual emails to understand what we're working with:\n")
for priority in ['Urgent', 'High', 'Medium', 'Low']:
    print("-"*80)
    print(f"π§ {priority.upper()} PRIORITY EMAIL EXAMPLE")
    # Get one random example
    sample = df[df['priority'] == priority].sample(1).iloc[0]
    print(f"\nπ Subject: {sample['subject']}")
    print(f"π Body:\n{sample['body']}\n")
Let's read some actual emails to understand what we're working with: -------------------------------------------------------------------------------- π§ URGENT PRIORITY EMAIL EXAMPLE π Subject: EMERGENCY: Service completely down π Body: CRITICAL: Cannot login and I have a deadline in 30 minutes! -------------------------------------------------------------------------------- π§ HIGH PRIORITY EMAIL EXAMPLE π Subject: Error message when exporting π Body: I am getting an error when I try to export my data. Can you help me fix this? -------------------------------------------------------------------------------- π§ MEDIUM PRIORITY EMAIL EXAMPLE π Subject: How do I configure this? π Body: I have a question about how the billing works for multiple users. -------------------------------------------------------------------------------- π§ LOW PRIORITY EMAIL EXAMPLE π Subject: Suggestion for improvement π Body: I have a suggestion for improving the user interface.
Basic InformationΒΆ
InΒ [38]:
# Structural overview of the dataset: size, schema, dtypes, and missingness.
divider = "=" * 50

print("Dataset Information:")
print("-" * 50)
print(f"Total emails: {len(df)}")
print(f"Number of columns: {len(df.columns)}")
print(f"Column names: {list(df.columns)}")

print("\n" + divider)
print("Data types:")
print(df.dtypes)

print("\n" + divider)
print("Missing values:")
print(df.isnull().sum())
Dataset Information: -------------------------------------------------- Total emails: 1000 Number of columns: 4 Column names: ['email_id', 'subject', 'body', 'priority'] ================================================== Data types: email_id int64 subject str body str priority str dtype: object ================================================== Missing values: email_id 0 subject 0 body 0 priority 0 dtype: int64
Ensuring Priority DistributionΒΆ
InΒ [39]:
# Class balance check — this drives the later class_weight decision.
print("Priority Distribution:")
print("-" * 50)

counts = df['priority'].value_counts()
print("\nCounts:")
print(counts)

# Same values as value_counts(normalize=True) * 100, derived from the
# counts already computed above.
print("\nPercentages:")
percentages = counts / counts.sum() * 100
for priority, pct in percentages.items():
    print(f"{priority:10s}: {pct:5.1f}%")
Priority Distribution: -------------------------------------------------- Counts: priority Medium 400 High 250 Low 250 Urgent 100 Name: count, dtype: int64 Percentages: Medium : 40.0% High : 25.0% Low : 25.0% Urgent : 10.0%
Visual of Priority DistributionΒΆ
InΒ [40]:
# Two views of the class balance: absolute counts (bar) and shares (pie).
# The figure is saved to ../assets for the project write-up.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Bar chart
# One fixed color per priority so the palette stays consistent across plots
colors = {'Urgent': '#e74c3c', 'High': '#e67e22', 'Medium': '#f39c12', 'Low': '#2ecc71'}
priority_counts = df['priority'].value_counts()
bars = axes[0].bar(priority_counts.index, priority_counts.values)
# Color each bar
for bar, priority in zip(bars, priority_counts.index):
    bar.set_color(colors[priority])
axes[0].set_title('Email Count by Priority', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Priority', fontsize=12)
axes[0].set_ylabel('Number of Emails', fontsize=12)
axes[0].grid(axis='y', alpha=0.3)
# Add count labels on bars (the +10 offset lifts each label just above its bar)
for i, (priority, count) in enumerate(priority_counts.items()):
    axes[0].text(i, count + 10, str(count), ha='center', fontweight='bold')
# Pie chart
pie_colors = [colors[p] for p in priority_counts.index]
axes[1].pie(priority_counts.values, labels=priority_counts.index, autopct='%1.1f%%',
            colors=pie_colors, startangle=90, textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[1].set_title('Priority Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
fig.savefig('../assets/priority_distribution.png', dpi=150, bbox_inches='tight')
plt.show()
print("\nβ οΈ OBSERVATION: Class Imbalance Detected!")
print(" Urgent emails are only ~10% of the data")
print(" This means we need to use 'class_weight=balanced' when training")
β οΈ OBSERVATION: Class Imbalance Detected! Urgent emails are only ~10% of the data This means we need to use 'class_weight=balanced' when training
Text AnalysisΒΆ
InΒ [41]:
# Calculate lengths
df['subject_length'] = df['subject'].str.len()
df['body_length'] = df['body'].str.len()
df['word_count'] = df['body'].str.split().str.len()
print("Text Length Statistics:")
print("-"*50)
print(df[['subject_length', 'body_length', 'word_count']].describe())
print("\n\nBy Priority:")
print("-"*50)
print(df.groupby('priority')[['subject_length', 'body_length', 'word_count']].mean())
Text Length Statistics:
--------------------------------------------------
subject_length body_length word_count
count 1000.000000 1000.000000 1000.00000
mean 23.898000 62.314000 11.33800
std 4.241885 9.871416 2.75415
min 16.000000 45.000000 7.00000
25% 22.000000 57.000000 9.00000
50% 24.000000 59.000000 11.00000
75% 27.000000 68.000000 12.00000
max 36.000000 113.000000 21.00000
By Priority:
--------------------------------------------------
subject_length body_length word_count
priority
High 24.0400 66.664 12.4280
Low 20.6920 59.280 10.4480
Medium 24.9025 58.965 10.5825
Urgent 27.5400 72.420 13.8600
Visualize Text LengthsΒΆ
InΒ [42]:
# Box plots of each length feature, split by priority, to visualize whether
# length distributions differ across classes.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Box plots for each metric
for idx, (column, title) in enumerate([
    ('subject_length', 'Subject Length'),
    ('body_length', 'Body Length'),
    ('word_count', 'Word Count')
]):
    # Create box plot
    df.boxplot(column=column, by='priority', ax=axes[idx])
    axes[idx].set_title(title + ' by Priority', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Priority', fontsize=10)
    # NOTE(review): the y-axis shows a length (characters/words), so the
    # label 'Count' is slightly misleading — consider renaming.
    axes[idx].set_ylabel('Count', fontsize=10)
    axes[idx].get_figure().suptitle('')  # Remove automatic title
plt.tight_layout()
fig.savefig('../assets/text_length_boxplots.png', dpi=150, bbox_inches='tight')
plt.show()
print("\nπ OBSERVATION:")
print(" Text length alone doesn't strongly distinguish priorities")
print(" We need to look at the CONTENT, not just the length!")
π OBSERVATION: Text length alone doesn't strongly distinguish priorities We need to look at the CONTENT, not just the length!
Count Urgency SignalsΒΆ
InΒ [43]:
def count_urgency_signals(text):
    """
    Count simple urgency indicators in a piece of text.

    Parameters
    ----------
    text : str
        Raw email text (subject or body).

    Returns
    -------
    dict
        'exclamations' : number of '!' characters,
        'caps_words'   : number of all-caps words of 2+ letters,
        'urgent_words' : number of urgency keywords (case-insensitive),
        'questions'    : number of '?' characters.
    """
    text_lower = text.lower()
    return {
        'exclamations': text.count('!'),
        # Caps check runs on the original text, before lowercasing
        'caps_words': len(re.findall(r'\b[A-Z]{2,}\b', text)),
        # Fixed: the original alternation listed 'asap' twice (redundant)
        'urgent_words': len(re.findall(
            r'\b(urgent|asap|critical|emergency|immediately|now|help)\b',
            text_lower
        )),
        'questions': text.count('?')
    }
# Score every email body and attach the signal counts as new columns.
print("Analyzing urgency signals in emails...")

# One dict of counts per body, expanded into one column per signal
signal_cols = df['body'].apply(count_urgency_signals).apply(pd.Series)
df_analysis = pd.concat([df, signal_cols], axis=1)

# Average each indicator within a priority level
urgency_by_priority = df_analysis.groupby('priority')[
    ['exclamations', 'caps_words', 'urgent_words', 'questions']
].mean()

banner = "=" * 70
print("\n" + banner)
print("AVERAGE URGENCY INDICATORS BY PRIORITY")
print(banner)
print(urgency_by_priority)
Analyzing urgency signals in emails...
======================================================================
AVERAGE URGENCY INDICATORS BY PRIORITY
======================================================================
exclamations caps_words urgent_words questions
priority
High 0.00 0.000 0.232 0.232
Low 0.22 0.184 0.000 0.000
Medium 0.00 0.000 0.185 0.565
Urgent 1.17 0.610 1.640 0.000
Visualize Urgency IndicatorΒΆ
InΒ [44]:
# Grouped bar chart: average urgency indicators per priority level.
ax = urgency_by_priority.plot(kind='bar', figsize=(12, 6), width=0.8)
ax.set_title('Urgency Indicators by Priority', fontsize=14, fontweight='bold')
ax.set_xlabel('Priority', fontsize=12)
ax.set_ylabel('Average Count', fontsize=12)
ax.legend(title='Indicator', fontsize=10)
plt.xticks(rotation=45)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
# BUG FIX: the original called fig.savefig(...), but `fig` still pointed at
# the figure created in the earlier boxplot cell — pandas .plot() makes a NEW
# figure here, so the wrong image was being written to urgency_indicators.png.
ax.get_figure().savefig('../assets/urgency_indicators.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nπ― KEY FINDING:")
print(" β Urgent emails have MORE exclamation marks")
print(" β Urgent emails contain MORE urgent keywords")
print(" β These will be IMPORTANT FEATURES for our model!")
π― KEY FINDING: β Urgent emails have MORE exclamation marks β Urgent emails contain MORE urgent keywords β These will be IMPORTANT FEATURES for our model!
Find the most common wordsΒΆ
InΒ [45]:
def get_common_words(texts, n=15):
    """
    Extract the most common meaningful words from a collection of texts.

    Parameters
    ----------
    texts : iterable of str
        Texts to pool together (e.g. all bodies of one priority class).
    n : int, default 15
        Number of top words to return.

    Returns
    -------
    list of (str, int)
        The n most frequent words and their counts, most frequent first
        (ties keep first-seen order).
    """
    # Combine all texts into one lowercase blob
    all_text = ' '.join(texts).lower()
    # Extract words of 3+ letters (letters only) — this floor already drops
    # all 1-2 letter tokens such as 'my', 'is', 'to'
    words = re.findall(r'\b[a-z]{3,}\b', all_text)
    # Remove common stop words. Fixed: the original set also listed 'my',
    # which the 3-letter regex above can never produce (dead entry).
    stop_words = {
        'the', 'and', 'for', 'this', 'that', 'with', 'from',
        'have', 'are', 'not', 'you', 'can', 'need'
    }
    words = [w for w in words if w not in stop_words]
    return Counter(words).most_common(n)
# For each priority bucket, show its 10 most frequent content words with a
# simple text histogram — vocabulary differences are the strongest signal.
rule = "=" * 70
print(rule)
print("MOST COMMON WORDS BY PRIORITY")
print(rule)
for priority in ['Urgent', 'High', 'Medium', 'Low']:
    print(f"\n{rule}")
    print(f"π {priority.upper()} Priority")
    print(rule)
    bodies = df[df['priority'] == priority]['body']
    for word, count in get_common_words(bodies, n=10):
        bar = 'β' * (count // 3)  # one block per ~3 occurrences
        print(f" {word:15s} {count:3d} {bar}")
====================================================================== MOST COMMON WORDS BY PRIORITY ====================================================================== ====================================================================== π URGENT Priority ====================================================================== cannot 55 ββββββββββββββββββ account 44 ββββββββββββββ critical 39 βββββββββββββ access 33 βββββββββββ help 33 βββββββββββ now 32 ββββββββββ all 30 ββββββββββ urgent 29 βββββββββ entire 24 ββββββββ service 24 ββββββββ ====================================================================== π HIGH Priority ====================================================================== getting 105 βββββββββββββββββββββββββββββββββββ error 105 βββββββββββββββββββββββββββββββββββ when 105 βββββββββββββββββββββββββββββββββββ try 105 βββββββββββββββββββββββββββββββββββ export 58 βββββββββββββββββββ data 58 βββββββββββββββββββ help 58 βββββββββββββββββββ fix 58 βββββββββββββββββββ files 49 ββββββββββββββββ uploading 49 ββββββββββββββββ ====================================================================== π MEDIUM Priority ====================================================================== how 249 βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ question 88 βββββββββββββββββββββββββββββ about 88 βββββββββββββββββββββββββββββ billing 88 βββββββββββββββββββββββββββββ works 88 βββββββββββββββββββββββββββββ multiple 88 βββββββββββββββββββββββββββββ users 88 βββββββββββββββββββββββββββββ explain 87 βββββββββββββββββββββββββββββ set 87 βββββββββββββββββββββββββββββ advanced 87 βββββββββββββββββββββββββββββ ====================================================================== π LOW Priority ====================================================================== suggestion 112 βββββββββββββββββββββββββββββββββββββ would 95 βββββββββββββββββββββββββββββββ improving 57 βββββββββββββββββββ user 57 βββββββββββββββββββ interface 57 
βββββββββββββββββββ consider 55 ββββββββββββββββββ adding 55 ββββββββββββββββββ keyboard 55 ββββββββββββββββββ shortcuts 55 ββββββββββββββββββ just 55 ββββββββββββββββββ
Summary of FindingsΒΆ
InΒ [46]:
# Final recap of everything the exploration established, printed as a report.
print("="*70)
print("π EXPLORATION SUMMARY")
print("="*70)
# Fixed: this literal had been split across two lines by the export (syntax error)
print("\n✅ DATA QUALITY")
print(" β’ No missing values")
print(f" β’ Total emails: {len(df)}")
print(" β’ All text fields populated")
print("\nβ οΈ CLASS IMBALANCE DETECTED")
for priority, pct in df['priority'].value_counts(normalize=True).items():
    symbol = "π΄" if priority == "Urgent" else "π‘" if priority == "High" else "π’"
    print(f" {symbol} {priority:8s}: {pct*100:5.1f}%")
print(" β Action: Use class_weight='balanced' in model")
print("\nπ― DISTINGUISHING FEATURES FOUND")
print(" 1. Exclamation marks (more in urgent)")
print(" 2. Urgent keywords (urgent, asap, critical, emergency)")
print(" 3. CAPS words (more in urgent)")
print(" 4. Vocabulary differences (clear patterns)")
print("\nπ OBSERVATIONS")
print(" β’ Text length NOT very predictive")
print(" β’ Content/keywords ARE very predictive")
print(" β’ Clear patterns exist between priority levels")
print("\n⨠NEXT STEPS")
# Fixed: this literal had also been split across two lines by the export
print(" 1. ✅ Data exploration - COMPLETE")
print(" 2. βοΈ Feature engineering (TF-IDF + custom features)")
print(" 3. βοΈ Model training (Logistic Regression)")
print(" 4. βοΈ Evaluation (focus on Urgent recall)")
print("\n" + "="*70)
====================================================================== π EXPLORATION SUMMARY ====================================================================== β DATA QUALITY β’ No missing values β’ Total emails: 1000 β’ All text fields populated β οΈ CLASS IMBALANCE DETECTED π’ Medium : 40.0% π‘ High : 25.0% π’ Low : 25.0% π΄ Urgent : 10.0% β Action: Use class_weight='balanced' in model π― DISTINGUISHING FEATURES FOUND 1. Exclamation marks (more in urgent) 2. Urgent keywords (urgent, asap, critical, emergency) 3. CAPS words (more in urgent) 4. Vocabulary differences (clear patterns) π OBSERVATIONS β’ Text length NOT very predictive β’ Content/keywords ARE very predictive β’ Clear patterns exist between priority levels β¨ NEXT STEPS 1. β Data exploration - COMPLETE 2. βοΈ Feature engineering (TF-IDF + custom features) 3. βοΈ Model training (Logistic Regression) 4. βοΈ Evaluation (focus on Urgent recall) ======================================================================
Save workΒΆ
InΒ [47]:
# Persist the engineered features for the next notebook in the pipeline.
# Fixed: ../data/processed is never created earlier in this notebook, so
# create it before writing or to_csv raises FileNotFoundError.
os.makedirs('../data/processed', exist_ok=True)
df_analysis.to_csv('../data/processed/emails_with_features.csv', index=False)
# Fixed: this literal had been split across two lines by the export (syntax error)
print("✅ Saved analyzed data to: data/processed/emails_with_features.csv")
print("\nπ Don't forget to save this notebook! (Ctrl+S or Cmd+S)")
β Saved analyzed data to: data/processed/emails_with_features.csv π Don't forget to save this notebook! (Ctrl+S or Cmd+S)