InΒ [Β ]:
# --- Environment setup ---------------------------------------------------
# Core data stack, text feature extraction (sklearn), sparse matrix tools,
# sentiment (TextBlob), and model persistence (joblib).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack, csr_matrix
import re
from textblob import TextBlob
import joblib
import warnings

# Silence library deprecation chatter in notebook output.
warnings.filterwarnings('ignore')

# Plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# NOTE: the exported source had this string split by a mis-encoded emoji;
# restored to a single line.
print("✅ Libraries imported successfully!")
Load Data Set
InΒ [Β ]:
# Load the pre-processed email dataset produced by the previous notebook.
# Expected columns include at least 'subject', 'body', and 'priority'.
print("Loading data...")
df = pd.read_csv('../data/processed/emails_with_features.csv')

print(f"✅ Loaded {len(df)} emails")
print(f"\nColumns: {list(df.columns)}")
print(f"\nShape: {df.shape}")

# Quick look
df.head(3)
Clean and Prepare Text
InΒ [Β ]:
def clean_text(text):
    """
    Normalize raw email text before feature extraction.

    Note: We keep some "dirty" elements like !!! and CAPS
    because they're actually useful signals for urgency!
    """
    # Missing values become empty strings.
    if pd.isna(text):
        return ""

    # Convert to string (just in case).
    cleaned = str(text)

    # Strip URLs (not useful for priority), then email addresses (privacy).
    for pattern in (r'http\S+|www\S+', r'\S+@\S+'):
        cleaned = re.sub(pattern, '', cleaned)

    # Collapse runs of whitespace into single spaces and trim the ends.
    return re.sub(r'\s+', ' ', cleaned).strip()
# Apply cleaning to both text fields, then build the combined document
# (subject + body) that the TF-IDF vectorizer consumes below.
print("Cleaning text...")
df['subject_clean'] = df['subject'].apply(clean_text)
df['body_clean'] = df['body'].apply(clean_text)

# Combine subject and body for full text
df['full_text'] = df['subject_clean'] + ' ' + df['body_clean']

print("✅ Text cleaned!")
print(f"\nExample cleaned text:")
print(df['full_text'].iloc[0])
TF-IDF exampleΒΆ
InΒ [Β ]:
# Mini-demonstration of TF-IDF on a three-email toy corpus, run before
# fitting the real vectorizer so the output is easy to read.
print("-"*70)
print("UNDERSTANDING TF-IDF (Term Frequency - Inverse Document Frequency)")
print("-"*70)
# Simple example
sample_emails = [
    "urgent help needed now",
    "feature request for future",
    "urgent critical issue",
]
print("\nπ§ Sample emails:")
for i, email in enumerate(sample_emails, 1):
    print(f" {i}. {email}")
# Create TF-IDF vectorizer (simple version)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sample_emails)
# Show the vocabulary
print(f"\nπ Vocabulary (unique words found):")
print(f" {vectorizer.get_feature_names_out()}")
print(f"\nπ’ TF-IDF Matrix shape: {tfidf_matrix.shape}")
print(f" (3 emails Γ {len(vectorizer.get_feature_names_out())} words)")
# Show matrix as DataFrame for readability (toarray() densifies the
# sparse result -- fine at this tiny size).
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=['Email 1', 'Email 2', 'Email 3']
)
print(f"\nπ TF-IDF Values (higher = more important in that email):")
print(tfidf_df)
print("\nπ‘ KEY INSIGHTS:")
print(" β’ 'urgent' has HIGH value in emails 1 & 3 (appears there)")
print(" β’ 'urgent' has ZERO in email 2 (doesn't appear)")
print(" β’ Words appearing in ALL emails get LOWER scores (less distinctive)")
print(" β’ Rare words get HIGHER scores (more distinctive)")
Extract TF-IDF Features
InΒ [Β ]:
# Fit the real TF-IDF vectorizer on the full corpus of cleaned emails.
# The fitted vectorizer is saved later so new emails can be transformed
# identically at inference time.
print("Extracting TF-IDF features...")
print("-"*70)
# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=300,      # Keep top 300 most important words
    min_df=2,              # Word must appear in at least 2 emails
    max_df=0.8,            # Ignore words in more than 80% of emails
    ngram_range=(1, 2),    # Use single words AND word pairs
    stop_words='english',  # Remove common words (the, and, is, etc.)
    lowercase=True         # Convert to lowercase for consistency
)
# Fit on training data and transform
tfidf_features = tfidf_vectorizer.fit_transform(df['full_text'])

print(f"✅ TF-IDF features created!")
print(f"\nFeature matrix shape: {tfidf_features.shape}")
print(f" ({tfidf_features.shape[0]} emails Γ {tfidf_features.shape[1]} features)")
# Show feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nπ Sample feature names (words and word pairs):")
print(f" {list(feature_names[:20])}")
print(f"\nπΎ Memory used: {tfidf_features.data.nbytes / 1024:.2f} KB (sparse matrix!)")
Analyze Most Important WordsΒΆ
InΒ [Β ]:
def get_top_tfidf_words(tfidf_matrix, feature_names, priority_mask, top_n=10):
    """Get words with highest average TF-IDF score for a priority class.

    Parameters
    ----------
    tfidf_matrix : scipy sparse matrix, shape (n_emails, n_features)
    feature_names : sequence of str, one name per matrix column
    priority_mask : boolean pandas Series or array selecting the rows
        belonging to one priority class
    top_n : int, number of (word, score) pairs to return, best first

    Returns
    -------
    list of (word, mean_score) tuples sorted by descending mean TF-IDF.
    """
    # np.asarray accepts both a pandas Series and a plain boolean array
    # (the original `.values` attribute required a Series).
    mask = np.asarray(priority_mask)
    # Get TF-IDF values for this priority class only.
    priority_tfidf = tfidf_matrix[mask]
    # Mean TF-IDF per column; sparse .mean() returns np.matrix, so go
    # through asarray + flatten to get a plain 1-D vector.
    mean_tfidf = np.asarray(priority_tfidf.mean(axis=0)).flatten()
    # Indices of the top_n largest means, descending.
    top_indices = mean_tfidf.argsort()[-top_n:][::-1]
    top_words = [(feature_names[i], mean_tfidf[i]) for i in top_indices]
    return top_words
# Rank the vocabulary by mean TF-IDF within each priority class to see
# which words characterize Urgent vs High vs Medium vs Low emails.
print("-"*70)
print("MOST IMPORTANT WORDS BY PRIORITY (TF-IDF Analysis)")
print("-"*70)
for priority in ['Urgent', 'High', 'Medium', 'Low']:
    print(f"\nπ {priority.upper()} Priority:")
    print("-"*70)
    # Boolean row mask selecting emails of this priority class.
    priority_mask = df['priority'] == priority
    top_words = get_top_tfidf_words(tfidf_features, feature_names, priority_mask, top_n=15)
    for word, score in top_words:
        # Scale the score into a crude text bar chart for quick scanning.
        bar = 'β' * int(score * 100)
        print(f" {word:25s} {score:.4f} {bar}")
print("\n" + "-"*70)
print("π‘ OBSERVATION:")
print(" β’ Urgent: 'urgent', 'asap', 'critical', 'locked', 'cannot'")
print(" β’ Low: 'suggestion', 'feature', 'would', 'nice'")
print(" β’ Clear vocabulary differences!")
print("-"*70)
Create Custom Features
InΒ [Β ]:
def extract_custom_features(df):
    """
    Extract hand-crafted features based on our exploration.
    These capture signals that TF-IDF might miss!

    Expects columns: 'subject', 'body', 'subject_clean', 'body_clean',
    'full_text'. Returns a DataFrame of numeric features aligned with
    df's rows.
    """
    print("Extracting custom features...")
    features = pd.DataFrame()

    # 1. EXCLAMATION MARKS (urgent emails use more)
    features['exclamation_count'] = df['body'].str.count('!')

    # 2. QUESTION MARKS (confusion/issues) -- raw string so '\?' is a
    # regex escape, not a (deprecated) invalid string escape.
    features['question_count'] = df['body'].str.count(r'\?')

    # 3. CAPS WORDS (shouting = urgency)
    features['caps_word_count'] = df['body'].apply(
        lambda x: len(re.findall(r'\b[A-Z]{2,}\b', str(x)))
    )

    # 4. URGENT KEYWORDS (explicit urgency) -- non-capturing group keeps
    # the match count identical while avoiding match-group warnings.
    urgent_pattern = r'\b(?:urgent|asap|critical|emergency|immediately|now|help)\b'
    features['urgent_keyword_count'] = df['full_text'].str.lower().str.count(urgent_pattern)

    # 5. PROBLEM KEYWORDS (indicates issues)
    problem_pattern = r'\b(?:cannot|can\'t|unable|broken|error|failed|issue|problem)\b'
    features['problem_keyword_count'] = df['full_text'].str.lower().str.count(problem_pattern)

    # 6. SENTIMENT (negative = problems); TextBlob polarity is in [-1, 1]
    print(" Analyzing sentiment...")
    features['sentiment_polarity'] = df['body_clean'].apply(
        lambda x: TextBlob(str(x)).sentiment.polarity
    )

    # 7. TEXT LENGTH FEATURES
    features['word_count'] = df['body_clean'].str.split().str.len()
    features['char_count'] = df['body_clean'].str.len()

    # 8. SUBJECT LENGTH (short subjects might be urgent)
    features['subject_length'] = df['subject_clean'].str.len()

    # 9. ALL CAPS SUBJECT (indicates urgency); subjects of 3 chars or
    # fewer are treated as not-shouting to avoid false positives.
    features['subject_all_caps'] = df['subject'].apply(
        lambda x: int(str(x).isupper()) if len(str(x)) > 3 else 0
    )

    print(f"✅ Extracted {len(features.columns)} custom features!")
    return features
# Extract features
custom_features = extract_custom_features(df)
print(f"\nπ Custom features created:")
print(custom_features.head())
print(f"\nπ Custom feature statistics by priority:")
# Group the numeric features by the label to eyeball class separability.
print(custom_features.groupby(df['priority']).mean())
Visualize Custom FeaturesΒΆ
InΒ [Β ]:
# Box plots of the most discriminative custom features, split by priority,
# saved as a PNG for the project assets.
key_features = [
    'exclamation_count',
    'urgent_keyword_count',
    'problem_keyword_count',
    'sentiment_polarity'
]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for idx, feature in enumerate(key_features):
    # Create DataFrame for plotting
    plot_df = pd.DataFrame({
        'Priority': df['priority'],
        'Value': custom_features[feature]
    })
    # Box plot
    plot_df.boxplot(column='Value', by='Priority', ax=axes[idx])
    axes[idx].set_title(f'{feature}', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Priority')
    axes[idx].set_ylabel('Value')
    axes[idx].get_figure().suptitle('')  # Remove automatic title
plt.tight_layout()
fig.savefig('../assets/key_features_boxplots.png', dpi=150, bbox_inches='tight')
plt.show()
print("\nπ VISUALIZATION INSIGHTS:")
print(" β’ Exclamation marks: Higher in urgent emails")
print(" β’ Urgent keywords: Clear difference between priorities")
print(" β’ Sentiment: More negative in urgent/high priority")
Combine All FeaturesΒΆ
InΒ [Β ]:
# Horizontally stack the sparse TF-IDF block with the (sparsified) custom
# features into the final design matrix X; column order is
# [TF-IDF columns | custom columns], matching all_feature_names later.
print("Combining TF-IDF and custom features...")
print("-"*70)
# Convert custom features to sparse matrix for efficient combination
custom_features_sparse = csr_matrix(custom_features.values)
# Combine: [TF-IDF features | Custom features]
X = hstack([tfidf_features, custom_features_sparse])
# Target variable
y = df['priority']

print(f"✅ Combined feature matrix created!")
print(f"\nFinal feature matrix shape: {X.shape}")
print(f" β’ TF-IDF features: {tfidf_features.shape[1]}")
print(f" β’ Custom features: {custom_features.shape[1]}")
print(f" β’ Total features: {X.shape[1]}")
print(f"\nπ― Target variable (priority):")
print(y.value_counts())
Train Test SplitΒΆ
InΒ [Β ]:
# Stratified 80/20 split so train and test preserve the class balance.
print("Splitting data into train and test sets...")
print("="*70)
# Split with stratification (keeps class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # 20% for testing
    random_state=42,   # For reproducibility
    stratify=y         # Keep same class distribution in both sets
)

print(f"✅ Data split complete!")
print(f"\nπ Training set:")
print(f" β’ Samples: {X_train.shape[0]}")
print(f" β’ Features: {X_train.shape[1]}")
print(f" β’ Priority distribution:")
for priority, count in y_train.value_counts().items():
    percentage = count / len(y_train) * 100
    print(f" - {priority:8s}: {count:3d} ({percentage:5.1f}%)")
print(f"\nπ Test set:")
print(f" β’ Samples: {X_test.shape[0]}")
print(f" β’ Features: {X_test.shape[1]}")
print(f" β’ Priority distribution:")
for priority, count in y_test.value_counts().items():
    percentage = count / len(y_test) * 100
    print(f" - {priority:8s}: {count:3d} ({percentage:5.1f}%)")
print("\nπ‘ NOTE: Class distributions match! (thanks to stratify=y)")
Create Feature Names ListΒΆ
InΒ [Β ]:
# Concatenate TF-IDF column names with custom feature names, in the SAME
# order as the columns of X (TF-IDF block first, then custom block).
all_feature_names = (
    list(tfidf_vectorizer.get_feature_names_out()) +
    list(custom_features.columns)
)
print(f"Total features: {len(all_feature_names)}")
print(f"\nπ€ Sample TF-IDF features (first 10):")
print(all_feature_names[:10])
print(f"\n⨠Custom features (all):")
# Custom features occupy the tail of the combined name list.
print(all_feature_names[-len(custom_features.columns):])
print("\nπ‘ Why we need feature names:")
print(" β’ To understand which features are most important")
print(" β’ To explain model predictions")
print(" β’ To debug if model isn't working well")
Save EverythingΒΆ
InΒ [Β ]:
# Persist every artifact needed to reproduce this feature pipeline at
# inference time: fitted vectorizer, feature names, splits, metadata,
# and the full (dense) feature matrix for offline analysis.
import joblib  # already imported at top; repeated so this cell runs standalone
import json
from datetime import datetime

print("Saving feature engineering artifacts...")
print("-"*70)
# 1. Save the TF-IDF vectorizer (need this for new emails!)
joblib.dump(tfidf_vectorizer, '../models/tfidf_vectorizer.pkl')
print("✅ Saved TF-IDF vectorizer")
# 2. Save feature names
with open('../models/feature_names.json', 'w') as f:
    json.dump(all_feature_names, f, indent=2)
print("✅ Saved feature names")
# 3. Save train-test splits
joblib.dump({
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test
}, '../models/train_test_data.pkl')
print("✅ Saved train-test data")
# 4. Save metadata (keep tfidf_params in sync with the vectorizer cell)
metadata = {
    'created_date': datetime.now().isoformat(),
    'n_samples': len(df),
    'n_features': X.shape[1],
    'n_tfidf_features': tfidf_features.shape[1],
    'n_custom_features': custom_features.shape[1],
    'train_samples': X_train.shape[0],
    'test_samples': X_test.shape[0],
    'tfidf_params': {
        'max_features': 300,
        'min_df': 2,
        'max_df': 0.8,
        'ngram_range': '(1, 2)',
        'stop_words': 'english'
    },
    'custom_features': list(custom_features.columns)
}
with open('../models/feature_engineering_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("✅ Saved metadata")
# 5. Save processed features DataFrame (optional, for analysis)
# NOTE: .toarray() densifies the sparse matrix -- fine at this dataset
# size, but memory-heavy for much larger corpora.
feature_df = pd.DataFrame(
    X.toarray(),
    columns=all_feature_names
)
feature_df['priority'] = y.values
feature_df.to_csv('../data/processed/features_complete.csv', index=False)
print("✅ Saved complete feature DataFrame")

print("\n" + "-"*70)
print("π FEATURE ENGINEERING COMPLETE!")
print("-"*70)
print("\nπ¦ Saved files:")
print(" β’ models/tfidf_vectorizer.pkl")
print(" β’ models/feature_names.json")
print(" β’ models/train_test_data.pkl")
print(" β’ models/feature_engineering_metadata.json")
print(" β’ data/processed/features_complete.csv")
Summary and Next StepsΒΆ
InΒ [Β ]:
# Final recap of what this notebook produced and pointers to next steps.
print("-"*70)
print("π FEATURE ENGINEERING SUMMARY")
print("-"*70)
print("\n✅ COMPLETED TASKS:")
print(" 1. Loaded and cleaned email text")
print(" 2. Created TF-IDF features (300 features)")
print(" 3. Extracted custom urgency features (9 features)")
print(" 4. Combined into single feature matrix (309 features)")
print(" 5. Split into train (80%) and test (20%) sets")
print(" 6. Saved all artifacts for model training")
print("\nπ FEATURE BREAKDOWN:")
print(f" β’ Total features: {X.shape[1]}")
print(f" β’ TF-IDF (text content): {tfidf_features.shape[1]}")
print(f" β’ Custom (urgency signals): {custom_features.shape[1]}")
print("\nπ― KEY FEATURES IDENTIFIED:")
print(" TF-IDF captured:")
print(" β’ Urgent: 'urgent', 'asap', 'critical', 'locked'")
print(" β’ Low: 'suggestion', 'feature', 'would be nice'")
print(" ")
print(" Custom features captured:")
print(" β’ Exclamation marks (!!!)")
print(" β’ Urgent keywords count")
print(" β’ Problem keywords (cannot, error, broken)")
print(" β’ Sentiment polarity")
print("\nπ‘ WHY THIS MATTERS:")
print(" β’ ML models need numbers, not text")
print(" β’ TF-IDF captures word importance")
print(" β’ Custom features capture domain knowledge")
print(" β’ Combination gives model the best chance")
print("\n⨠NEXT STEPS:")
print(" 1. ✅ Data exploration - COMPLETE (Notebook 1)")
print(" 2. ✅ Feature engineering - COMPLETE (Notebook 2)")
print(" 3. βοΈ Model training - NEXT (Notebook 3)")
print(" 4. βοΈ Model evaluation")
print(" 5. βοΈ Build API")
print("\n" + "-"*70)
print("Ready to train your model! π")
print("-"*70)
Verify that these files were created:
models/
├── tfidf_vectorizer.pkl               ← TF-IDF transformer
├── feature_names.json                 ← List of all features
├── train_test_data.pkl                ← Split data
└── feature_engineering_metadata.json  ← Info about features
data/processed/
└── features_complete.csv              ← Full feature matrix