Upload 2 files
Browse files
- password_health_model.pkl +3 -0
- train_model.py +118 -0
password_health_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff95dc7a1b4a09157bc826278f8329cdbe300f84eab1d8a5e5f19de741460eb8
|
| 3 |
+
size 8468881
|
train_model.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Script to train and save the password health model.
|
| 2 |
+
# Loads labeled password dataset, extracts features, trains a RandomForest classifier, and saves the model.
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 6 |
+
from sklearn.model_selection import train_test_split
|
| 7 |
+
from sklearn.metrics import accuracy_score
|
| 8 |
+
import joblib
|
| 9 |
+
import random
|
| 10 |
+
import string
|
| 11 |
+
import re
|
| 12 |
+
import math
|
| 13 |
+
import zlib
|
| 14 |
+
from collections import Counter
|
| 15 |
+
|
| 16 |
+
# Number of samples drawn (weak) or generated (medium/strong) per class.
SAMPLES_PER_CLASS = 100000

# Character pools used for the synthetic passwords.
ALPHANUMERIC = string.ascii_letters + string.digits
ALPHANUMERIC_SYMBOLS = ALPHANUMERIC + "!@#$%"

# Load the weak password list into memory for training data.
# Blank lines are skipped so an empty string is never used as a sample.
with open("weak_passwords.txt", "r", encoding="utf-8", errors="ignore") as f:
    rockyou_passwords = [pw for pw in (line.strip() for line in f) if pw]

# Generate weak, medium, and strong password samples for training.
# Weak: real breached passwords; Medium/Strong: synthetic passwords with varying complexity.

# Weak passwords sampled from weak_passwords.txt.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the number of entries loaded.
weak_passwords = random.sample(
    rockyou_passwords,
    min(SAMPLES_PER_CLASS, len(rockyou_passwords)),
)

# Medium passwords: random length between 8 and 12; letters and digits,
# with the symbol set added for roughly 20% of the samples.
medium_passwords = []
for _ in range(SAMPLES_PER_CLASS):
    length = random.randint(8, 12)
    allowed_chars = ALPHANUMERIC_SYMBOLS if random.random() < 0.2 else ALPHANUMERIC
    medium_passwords.append("".join(random.choices(allowed_chars, k=length)))

# Strong passwords: random length between 12 and 16; letters, digits, and symbols.
strong_passwords = [
    "".join(random.choices(ALPHANUMERIC_SYMBOLS, k=random.randint(12, 16)))
    for _ in range(SAMPLES_PER_CLASS)
]

# Combine all passwords and assign labels: 0 (weak), 1 (medium), 2 (strong).
# Label counts follow the actual list sizes so data and labels always align,
# even when fewer weak passwords were available than requested.
data = weak_passwords + medium_passwords + strong_passwords
labels = (
    [0] * len(weak_passwords)
    + [1] * len(medium_passwords)
    + [2] * len(strong_passwords)
)
|
| 45 |
+
|
| 46 |
+
# Define function to extract password features for ML classification.
|
| 47 |
+
def password_features(password: str) -> dict:
    """
    Extract numeric features from a password for strength classification.

    Args:
        password: The candidate password. An empty string is accepted; it
            yields zero for every feature except ``compression_ratio``,
            which is 1.0.

    Returns:
        A dictionary with the keys ``length``, ``entropy``, ``has_upper``,
        ``has_symbol``, ``has_leet``, ``repetition``, ``digit_ratio``,
        ``unique_ratio``, ``bigram_entropy`` and ``compression_ratio``,
        inserted in that order (the order determines the column order of a
        DataFrame built from these dictionaries).
    """
    length = len(password)
    unique_count = len(set(password))

    features = {
        "length": length,
        # log2(unique ** length) == length * log2(unique); the product form
        # avoids building an arbitrarily large integer for long passwords.
        "entropy": length * math.log2(unique_count) if length else 0.0,
        "has_upper": int(bool(re.search(r"[A-Z]", password))),
        "has_symbol": int(bool(re.search(r"[^A-Za-z0-9]", password))),
        "has_leet": int(any(c in "@3!0" for c in password)),
        # Three or more consecutive occurrences of the same character.
        "repetition": int(bool(re.search(r"(.)\1{2,}", password))),
    }

    # Proportion of digits and proportion of distinct characters.
    num_digits = sum(1 for c in password if c.isdigit())
    features["digit_ratio"] = num_digits / length if length else 0.0
    features["unique_ratio"] = unique_count / length if length else 0.0

    # Shannon entropy over the distribution of adjacent character pairs.
    if length >= 2:
        bigram_counts = Counter(password[i:i + 2] for i in range(length - 1))
        total_bigrams = length - 1
        features["bigram_entropy"] = -sum(
            (count / total_bigrams) * math.log2(count / total_bigrams)
            for count in bigram_counts.values()
        )
    else:
        features["bigram_entropy"] = 0

    # Compressed size in bytes relative to the password length in characters.
    features["compression_ratio"] = (
        len(zlib.compress(password.encode())) / length if length else 1.0
    )

    return features
|
| 87 |
+
|
| 88 |
+
# Path the trained model is serialized to.
MODEL_PATH = "password_health_model.pkl"

# Extract features for all passwords and build the DataFrame.
df = pd.DataFrame([password_features(pw) for pw in data])

# Add breached status and labels to the DataFrame.
# NOTE(review): hibp_breached is computed directly from the label (1 exactly
# when the label is 0), so it perfectly separates the weak class and leaks the
# target into the features. The column is kept because consumers of the saved
# model may supply it at prediction time -- confirm, and consider sourcing it
# independently of the label.
df["hibp_breached"] = [int(label == 0) for label in labels]
df["label"] = labels

# Prepare features (X) and target (y) for training.
X = df.drop(columns="label")
y = df["label"]

# Split into training and test sets for evaluation.
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train the Random Forest Classifier.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,  # Limit depth to prevent overfitting
    min_samples_split=5,  # Require at least 5 samples to split
    random_state=42,
)
model.fit(X_train, y_train)

# Evaluate model accuracy on the test set.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2%}")

# Serialize trained model for use in the application.
joblib.dump(model, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'")
|