In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack, csr_matrix
import re
from textblob import TextBlob
import joblib
import warnings
warnings.filterwarnings('ignore')

# Plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✅ Libraries imported successfully!")

Load Data Set¶

In [ ]:
print("Loading data...")
df = pd.read_csv('../data/processed/emails_with_features.csv')

print(f"✅ Loaded {len(df)} emails")
print(f"\nColumns: {list(df.columns)}")
print(f"\nShape: {df.shape}")

# Quick look
df.head(3)

Clean and Prepare Text¶

In [ ]:
def clean_text(text):
    """
    Clean text for feature extraction.
    
    Note: We keep some "dirty" elements like !!! and CAPS
    because they're actually useful signals for urgency!
    """
    if pd.isna(text):
        return ""
    
    # Convert to string (just in case)
    text = str(text)
    
    # Remove URLs (not useful for priority)
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove email addresses (privacy)
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply cleaning
print("Cleaning text...")
df['subject_clean'] = df['subject'].apply(clean_text)
df['body_clean'] = df['body'].apply(clean_text)

# Combine subject and body for full text
df['full_text'] = df['subject_clean'] + ' ' + df['body_clean']

print("✅ Text cleaned!")
print(f"\nExample cleaned text:")
print(df['full_text'].iloc[0])
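
A quick sanity check on a made-up raw string (hypothetical text, not from the dataset) confirms that URLs and addresses are stripped while the "dirty" urgency signals survive:

In [ ]:
# Hypothetical raw text to sanity-check the cleaner
raw = "HELP!!! See http://example.com or mail me at bob@example.com    ASAP"
print(clean_text(raw))
# -> "HELP!!! See or mail me at ASAP"  (!!! and CAPS kept; URL and email removed)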

TF-IDF Example¶

In [ ]:
print("-"*70)
print("UNDERSTANDING TF-IDF (Term Frequency - Inverse Document Frequency)")
print("-"*70)

# Simple example
sample_emails = [
    "urgent help needed now",
    "feature request for future",
    "urgent critical issue",
]

print("\n📧 Sample emails:")
for i, email in enumerate(sample_emails, 1):
    print(f"   {i}. {email}")

# Create TF-IDF vectorizer (simple version)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sample_emails)

# Show the vocabulary
print(f"\n📚 Vocabulary (unique words found):")
print(f"   {vectorizer.get_feature_names_out()}")

print(f"\n🔢 TF-IDF Matrix shape: {tfidf_matrix.shape}")
print(f"   (3 emails × {len(vectorizer.get_feature_names_out())} words)")

# Show matrix as DataFrame for readability
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=['Email 1', 'Email 2', 'Email 3']
)

print(f"\n📊 TF-IDF Values (higher = more important in that email):")
print(tfidf_df)

print("\n💡 KEY INSIGHTS:")
print("   • 'urgent' has HIGH value in emails 1 & 3 (appears there)")
print("   • 'urgent' has ZERO in email 2 (doesn't appear)")
print("   • Words appearing in ALL emails get LOWER scores (less distinctive)")
print("   • Rare words get HIGHER scores (more distinctive)")
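
For the curious, the numbers above can be reproduced by hand, assuming scikit-learn's defaults (smooth_idf=True, L2 row normalization). A minimal sketch for 'urgent' in Email 1:

In [ ]:
# Reproduce Email 1's TF-IDF score for 'urgent' by hand (sklearn defaults)
n_docs = 3
def idf(doc_freq):
    # smoothed IDF: ln((1 + N) / (1 + df)) + 1
    return np.log((1 + n_docs) / (1 + doc_freq)) + 1

# Email 1 tokens: urgent (df=2); help, needed, now (df=1 each); every TF = 1
raw_row = np.array([idf(2), idf(1), idf(1), idf(1)])
print(f"urgent (by hand): {raw_row[0] / np.linalg.norm(raw_row):.4f}")  # L2-normalize the row
print(f"urgent (sklearn): {tfidf_df.loc['Email 1', 'urgent']:.4f}")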

Extract TF-IDF Features¶

In [ ]:
print("Extracting TF-IDF features...")
print("-"*70)

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=300,        # Keep top 300 most important words
    min_df=2,                # Word must appear in at least 2 emails
    max_df=0.8,              # Ignore words in more than 80% of emails
    ngram_range=(1, 2),      # Use single words AND word pairs
    stop_words='english',    # Remove common words (the, and, is, etc.)
    lowercase=True           # Convert to lowercase for consistency
)

# Fit on training data and transform
tfidf_features = tfidf_vectorizer.fit_transform(df['full_text'])

print(f"✅ TF-IDF features created!")
print(f"\nFeature matrix shape: {tfidf_features.shape}")
print(f"   ({tfidf_features.shape[0]} emails × {tfidf_features.shape[1]} features)")

# Show feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\n📚 Sample feature names (words and word pairs):")
print(f"   {list(feature_names[:20])}")

print(f"\n💾 Memory used by stored values: {tfidf_features.data.nbytes / 1024:.2f} KB (sparse matrix!)")
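
One detail worth remembering: new emails must be encoded with .transform() on this fitted vectorizer (never fit_transform, which would rebuild the vocabulary). A quick sketch with a made-up email:

In [ ]:
# Encode an unseen (hypothetical) email with the fitted vectorizer
new_email = ["urgent cannot login to my account please help asap"]
new_vec = tfidf_vectorizer.transform(new_email)   # transform, NOT fit_transform
print(f"Shape: {new_vec.shape}")                  # same columns as the training matrix
print(f"Non-zero features: {new_vec.nnz}")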

Analyze Most Important Words¶

In [ ]:
def get_top_tfidf_words(tfidf_matrix, feature_names, priority_mask, top_n=10):
    """Get words with highest average TF-IDF score for a priority class"""
    
    # Get TF-IDF values for this priority
    priority_tfidf = tfidf_matrix[priority_mask.values]
    
    # Calculate mean TF-IDF score for each word
    mean_tfidf = np.asarray(priority_tfidf.mean(axis=0)).flatten()
    
    # Get top words
    top_indices = mean_tfidf.argsort()[-top_n:][::-1]
    top_words = [(feature_names[i], mean_tfidf[i]) for i in top_indices]
    
    return top_words

print("-"*70)
print("MOST IMPORTANT WORDS BY PRIORITY (TF-IDF Analysis)")
print("-"*70)

for priority in ['Urgent', 'High', 'Medium', 'Low']:
    print(f"\n🔍 {priority.upper()} Priority:")
    print("-"*70)
    
    priority_mask = df['priority'] == priority
    top_words = get_top_tfidf_words(tfidf_features, feature_names, priority_mask, top_n=15)
    
    for word, score in top_words:
        bar = '█' * int(score * 100)
        print(f"   {word:25s} {score:.4f} {bar}")

print("\n" + "-"*70)
print("💡 OBSERVATION:")
print("   • Urgent: 'urgent', 'asap', 'critical', 'locked', 'cannot'")
print("   • Low: 'suggestion', 'feature', 'would', 'nice'")
print("   • Clear vocabulary differences!")
print("-"*70)

Create Custom Features¶

In [ ]:
def extract_custom_features(df):
    """
    Extract hand-crafted features based on our exploration.
    These capture signals that TF-IDF might miss!
    """
    
    print("Extracting custom features...")
    
    features = pd.DataFrame()
    
    # 1. EXCLAMATION MARKS (urgent emails use more)
    features['exclamation_count'] = df['body'].str.count('!')
    
    # 2. QUESTION MARKS (confusion/issues)
    features['question_count'] = df['body'].str.count(r'\?')
    
    # 3. CAPS WORDS (shouting = urgency)
    features['caps_word_count'] = df['body'].apply(
        lambda x: len(re.findall(r'\b[A-Z]{2,}\b', str(x)))
    )
    
    # 4. URGENT KEYWORDS (explicit urgency)
    urgent_pattern = r'\b(urgent|asap|critical|emergency|immediately|now|help)\b'
    features['urgent_keyword_count'] = df['full_text'].str.lower().str.count(urgent_pattern)
    
    # 5. PROBLEM KEYWORDS (indicates issues)
    problem_pattern = r'\b(cannot|can\'t|unable|broken|error|failed|issue|problem)\b'
    features['problem_keyword_count'] = df['full_text'].str.lower().str.count(problem_pattern)
    
    # 6. SENTIMENT (negative = problems)
    print("   Analyzing sentiment...")
    features['sentiment_polarity'] = df['body_clean'].apply(
        lambda x: TextBlob(str(x)).sentiment.polarity
    )
    
    # 7. TEXT LENGTH FEATURES
    features['word_count'] = df['body_clean'].str.split().str.len()
    features['char_count'] = df['body_clean'].str.len()
    
    # 8. SUBJECT LENGTH (short subjects might be urgent)
    features['subject_length'] = df['subject_clean'].str.len()
    
    # 9. ALL CAPS SUBJECT (indicates urgency)
    features['subject_all_caps'] = df['subject'].apply(
        lambda x: int(str(x).isupper()) if len(str(x)) > 3 else 0
    )
    
    print(f"✅ Extracted {len(features.columns)} custom features!")
    
    return features

# Extract features
custom_features = extract_custom_features(df)

print(f"\n📊 Custom features created:")
print(custom_features.head())

print(f"\n📈 Custom feature statistics by priority:")
print(custom_features.groupby(df['priority']).mean())
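
To see the extractor end-to-end on fresh input, here is a sketch on a single hypothetical email (the helper columns are rebuilt exactly as above):

In [ ]:
# Sanity-check the extractor on one made-up email
demo = pd.DataFrame({'subject': ['SERVER DOWN!!!'],
                     'body': ['Help! I CANNOT login, this is URGENT!!!']})
demo['subject_clean'] = demo['subject'].apply(clean_text)
demo['body_clean'] = demo['body'].apply(clean_text)
demo['full_text'] = demo['subject_clean'] + ' ' + demo['body_clean']
print(extract_custom_features(demo).T)   # transpose: one row per feature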

Visualize Custom Features¶

In [ ]:
key_features = [
    'exclamation_count',
    'urgent_keyword_count',
    'problem_keyword_count',
    'sentiment_polarity'
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, feature in enumerate(key_features):
    # Create DataFrame for plotting
    plot_df = pd.DataFrame({
        'Priority': df['priority'],
        'Value': custom_features[feature]
    })
    
    # Box plot
    plot_df.boxplot(column='Value', by='Priority', ax=axes[idx])
    axes[idx].set_title(f'{feature}', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Priority')
    axes[idx].set_ylabel('Value')
    axes[idx].get_figure().suptitle('')  # Remove automatic title

plt.tight_layout()
fig.savefig('../assets/key_features_boxplots.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n📊 VISUALIZATION INSIGHTS:")
print("   • Exclamation marks: Higher in urgent emails")
print("   • Urgent keywords: Clear difference between priorities")
print("   • Sentiment: More negative in urgent/high priority")

Combine All Features¶

In [ ]:
print("Combining TF-IDF and custom features...")
print("-"*70)

# Convert custom features to sparse matrix for efficient combination
custom_features_sparse = csr_matrix(custom_features.values)

# Combine: [TF-IDF features | Custom features]
X = hstack([tfidf_features, custom_features_sparse])

# Target variable
y = df['priority']

print(f"✅ Combined feature matrix created!")
print(f"\nFinal feature matrix shape: {X.shape}")
print(f"   • TF-IDF features: {tfidf_features.shape[1]}")
print(f"   • Custom features: {custom_features.shape[1]}")
print(f"   • Total features: {X.shape[1]}")

print(f"\n🎯 Target variable (priority):")
print(y.value_counts())
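
One caveat worth flagging: the TF-IDF rows are L2-normalized (values in [0, 1]) while the custom counts are unbounded, which can skew scale-sensitive models. If that shows up during training, a sparse-friendly rescale is one option (a sketch, not applied here):

In [ ]:
# Optional: rescale custom features without densifying (NOT applied above)
from sklearn.preprocessing import MaxAbsScaler

custom_scaled = MaxAbsScaler().fit_transform(custom_features_sparse)  # stays sparse
X_scaled = hstack([tfidf_features, custom_scaled])
print(f"Scaled matrix shape: {X_scaled.shape}")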

Train-Test Split¶

In [ ]:
print("Splitting data into train and test sets...")
print("-"*70)

# Split with stratification (keeps class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y,
    test_size=0.2,           # 20% for testing
    random_state=42,         # For reproducibility
    stratify=y               # Keep same class distribution in both sets
)

print(f"✅ Data split complete!")
print(f"\n📊 Training set:")
print(f"   • Samples: {X_train.shape[0]}")
print(f"   • Features: {X_train.shape[1]}")
print(f"   • Priority distribution:")
for priority, count in y_train.value_counts().items():
    percentage = count / len(y_train) * 100
    print(f"     - {priority:8s}: {count:3d} ({percentage:5.1f}%)")

print(f"\n📊 Test set:")
print(f"   • Samples: {X_test.shape[0]}")
print(f"   • Features: {X_test.shape[1]}")
print(f"   • Priority distribution:")
for priority, count in y_test.value_counts().items():
    percentage = count / len(y_test) * 100
    print(f"     - {priority:8s}: {count:3d} ({percentage:5.1f}%)")

print("\n💡 NOTE: Class distributions match! (thanks to stratify=y)")
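
The claim is easy to verify directly by comparing normalized class frequencies:

In [ ]:
# Verify stratification: train/test class proportions should nearly match
comparison = pd.DataFrame({
    'train': y_train.value_counts(normalize=True).round(3),
    'test': y_test.value_counts(normalize=True).round(3),
})
print(comparison)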

Create Feature Names List¶

In [ ]:
all_feature_names = (
    list(tfidf_vectorizer.get_feature_names_out()) + 
    list(custom_features.columns)
)

print(f"Total features: {len(all_feature_names)}")
print(f"\n🔤 Sample TF-IDF features (first 10):")
print(all_feature_names[:10])

print(f"\n✨ Custom features (all):")
print(all_feature_names[-len(custom_features.columns):])

print("\n💡 Why we need feature names:")
print("   • To understand which features are most important")
print("   • To explain model predictions")
print("   • To debug if model isn't working well")
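
As a concrete example of that last point, the sketch below names the largest feature values for the first email. Note how the unscaled custom counts (e.g. char_count) will typically dominate the L2-normalized TF-IDF values:

In [ ]:
# Map one email's largest feature values back to feature names
row = X.tocsr()[0]    # first email as a sparse row
top = sorted(zip(row.indices, row.data), key=lambda t: -t[1])[:10]
for idx, val in top:
    print(f"   {all_feature_names[idx]:25s} {val:.4f}")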

Save Everything¶

In [ ]:
import joblib
import json
from datetime import datetime

print("Saving feature engineering artifacts...")
print("-"*70)

# 1. Save the TF-IDF vectorizer (need this for new emails!)
joblib.dump(tfidf_vectorizer, '../models/tfidf_vectorizer.pkl')
print("✅ Saved TF-IDF vectorizer")

# 2. Save feature names
with open('../models/feature_names.json', 'w') as f:
    json.dump(all_feature_names, f, indent=2)
print("✅ Saved feature names")

# 3. Save train-test splits
joblib.dump({
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test
}, '../models/train_test_data.pkl')
print("✅ Saved train-test data")

# 4. Save metadata
metadata = {
    'created_date': datetime.now().isoformat(),
    'n_samples': len(df),
    'n_features': X.shape[1],
    'n_tfidf_features': tfidf_features.shape[1],
    'n_custom_features': custom_features.shape[1],
    'train_samples': X_train.shape[0],
    'test_samples': X_test.shape[0],
    'tfidf_params': {
        'max_features': 300,
        'min_df': 2,
        'max_df': 0.8,
        'ngram_range': [1, 2],   # stored as a list (JSON has no tuples)
        'stop_words': 'english'
    },
    'custom_features': list(custom_features.columns)
}

with open('../models/feature_engineering_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("✅ Saved metadata")

# 5. Save processed features DataFrame (optional, for analysis)
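# Note: .toarray() densifies the sparse matrix; fine at this dataset's size, but avoid for very large corpora.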
feature_df = pd.DataFrame(
    X.toarray(),
    columns=all_feature_names
)
feature_df['priority'] = y.values
feature_df.to_csv('../data/processed/features_complete.csv', index=False)
print("✅ Saved complete feature DataFrame")

print("\n" + "-"*70)
print("🎉 FEATURE ENGINEERING COMPLETE!")
print("-"*70)
print("\n📦 Saved files:")
print("   • models/tfidf_vectorizer.pkl")
print("   • models/feature_names.json")
print("   • models/train_test_data.pkl")
print("   • models/feature_engineering_metadata.json")
print("   • data/processed/features_complete.csv")
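
In the next notebook, these artifacts are expected to be reloaded along these lines (a sketch of the intended usage):

In [ ]:
# Sketch: how Notebook 3 can reload the saved artifacts
tfidf_vectorizer = joblib.load('../models/tfidf_vectorizer.pkl')
splits = joblib.load('../models/train_test_data.pkl')
X_train, y_train = splits['X_train'], splits['y_train']
print(f"Reloaded: {X_train.shape[0]} training samples, {X_train.shape[1]} features")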

Summary and Next Steps¶

In [ ]:
print("-"*70)
print("📋 FEATURE ENGINEERING SUMMARY")
print("-"*70)

print("\n✅ COMPLETED TASKS:")
print("   1. Loaded and cleaned email text")
print("   2. Created TF-IDF features (up to 300 features)")
print("   3. Extracted custom urgency features (9 features)")
print("   4. Combined everything into a single feature matrix")
print("   5. Split into train (80%) and test (20%) sets")
print("   6. Saved all artifacts for model training")

print("\n📊 FEATURE BREAKDOWN:")
print(f"   • Total features: {X.shape[1]}")
print(f"   • TF-IDF (text content): {tfidf_features.shape[1]}")
print(f"   • Custom (urgency signals): {custom_features.shape[1]}")

print("\n🎯 KEY FEATURES IDENTIFIED:")
print("   TF-IDF captured:")
print("      • Urgent: 'urgent', 'asap', 'critical', 'locked'")
print("      • Low: 'suggestion', 'feature', 'would be nice'")
print("   ")
print("   Custom features captured:")
print("      • Exclamation marks (!!!)")
print("      • Urgent keywords count")
print("      • Problem keywords (cannot, error, broken)")
print("      • Sentiment polarity")

print("\n💡 WHY THIS MATTERS:")
print("   • ML models need numbers, not text")
print("   • TF-IDF captures word importance")
print("   • Custom features capture domain knowledge")
print("   • The combination gives the model the best chance")

print("\n✨ NEXT STEPS:")
print("   1. ✅ Data exploration - COMPLETE (Notebook 1)")
print("   2. ✅ Feature engineering - COMPLETE (Notebook 2)")
print("   3. ⏭️  Model training - NEXT (Notebook 3)")
print("   4. ⏭️  Model evaluation")
print("   5. ⏭️  Build API")

print("\n" + "-"*70)
print("Ready to train your model! πŸš€")
print("-"*70)

Verify These Files Exist¶

models/
β”œβ”€β”€ tfidf_vectorizer.pkl              ← TF-IDF transformer
β”œβ”€β”€ feature_names.json                ← List of all features
β”œβ”€β”€ train_test_data.pkl               ← Split data
└── feature_engineering_metadata.json ← Info about features

data/processed/
└── features_complete.csv             ← Full feature matrix