naailkhokhar committed on
Commit
ce40857
·
verified ·
1 Parent(s): 85e932c

Upload 2 files

Browse files
Files changed (2) hide show
  1. password_health_model.pkl +3 -0
  2. train_model.py +118 -0
password_health_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff95dc7a1b4a09157bc826278f8329cdbe300f84eab1d8a5e5f19de741460eb8
3
+ size 8468881
train_model.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Script to train and save the password health model.
2
+ # Loads labeled password dataset, extracts features, trains a RandomForest classifier, and saves the model.
3
+
4
+ import pandas as pd
5
+ from sklearn.ensemble import RandomForestClassifier
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.metrics import accuracy_score
8
+ import joblib
9
+ import random
10
+ import string
11
+ import re
12
+ import math
13
+ import zlib
14
+ from collections import Counter
15
+
16
# Load the breached-password list that supplies the "weak" class.
# Blank lines are dropped so an empty string is never labelled as a password.
with open("weak_passwords.txt", "r", encoding="utf-8", errors="ignore") as f:
    rockyou_passwords = [pw for pw in (line.strip() for line in f) if pw]

# Number of samples drawn/generated for each of the three classes.
SAMPLES_PER_CLASS = 100000
ALPHANUMERIC = string.ascii_letters + string.digits
SYMBOLS = "!@#$%"

# Dedicated seeded generator: the dataset becomes reproducible (the split and
# the model below already use random_state=42) without touching the global
# `random` state.
rng = random.Random(42)

# Weak: real breached passwords sampled without replacement.
# random.sample would raise ValueError anyway; fail with a message that says why.
if len(rockyou_passwords) < SAMPLES_PER_CLASS:
    raise ValueError(
        f"weak_passwords.txt has {len(rockyou_passwords)} usable entries; "
        f"at least {SAMPLES_PER_CLASS} are required"
    )
weak_passwords = rng.sample(rockyou_passwords, SAMPLES_PER_CLASS)

# Medium: length 8-12, letters and digits; about 20% of samples may also
# draw from the symbol set.
medium_passwords = []
for _ in range(SAMPLES_PER_CLASS):
    length = rng.randint(8, 12)
    if rng.random() < 0.2:
        allowed_chars = ALPHANUMERIC + SYMBOLS
    else:
        allowed_chars = ALPHANUMERIC
    medium_passwords.append("".join(rng.choices(allowed_chars, k=length)))

# Strong: length 12-16, always drawn from letters, digits and symbols.
strong_passwords = [
    "".join(rng.choices(ALPHANUMERIC + SYMBOLS, k=rng.randint(12, 16)))
    for _ in range(SAMPLES_PER_CLASS)
]

# Combine all passwords and assign labels: 0 (weak), 1 (medium), 2 (strong).
# `labels` is aligned index-for-index with `data`.
data = weak_passwords + medium_passwords + strong_passwords
labels = (
    [0] * len(weak_passwords)
    + [1] * len(medium_passwords)
    + [2] * len(strong_passwords)
)

# Define function to extract password features for ML classification.
def password_features(password: str) -> dict:
    """
    Extract numeric features from a password for strength classification.

    Args:
        password: The password to analyse. An empty string is accepted.

    Returns:
        A dict with these keys, in this insertion order:
        length, entropy, has_upper, has_symbol, has_leet, repetition,
        digit_ratio, unique_ratio, bigram_entropy, compression_ratio.
        For an empty password every ratio/entropy is 0 and
        compression_ratio is 1.0.
    """
    length = len(password)
    distinct = len(set(password))

    features = {
        "length": length,
        # log2(distinct ** length) == length * log2(distinct); the product form
        # avoids materialising a huge integer for long passwords.
        "entropy": length * math.log2(distinct) if password else 0,
        "has_upper": int(bool(re.search(r"[A-Z]", password))),
        "has_symbol": int(bool(re.search(r"[^A-Za-z0-9]", password))),
        "has_leet": int(any(c in "@3!0" for c in password)),
        # Any character repeated three or more times in a row.
        "repetition": int(bool(re.search(r"(.)\1{2,}", password))),
    }

    # Proportion of digits
    num_digits = sum(1 for c in password if c.isdigit())
    features["digit_ratio"] = num_digits / length if password else 0

    # Unique character ratio
    features["unique_ratio"] = distinct / length if password else 0

    # Bigram entropy: Shannon entropy of the adjacent character pairs.
    if length >= 2:
        bigram_counts = Counter(password[i:i + 2] for i in range(length - 1))
        total_bigrams = length - 1
        features["bigram_entropy"] = -sum(
            (count / total_bigrams) * math.log2(count / total_bigrams)
            for count in bigram_counts.values()
        )
    else:
        features["bigram_entropy"] = 0

    # Compression ratio: compressed bytes over original *bytes*. Dividing by the
    # character count would inflate the ratio for multi-byte (non-ASCII)
    # passwords; for pure-ASCII passwords the two are identical.
    if password:
        raw = password.encode()
        features["compression_ratio"] = len(zlib.compress(raw)) / len(raw)
    else:
        features["compression_ratio"] = 1.0

    return features

# Extract features for all passwords and build the DataFrame (one row per
# password, one column per feature returned by password_features).
df = pd.DataFrame([password_features(pw) for pw in data])

# Add breached status and labels to the DataFrame.
# NOTE(review): hibp_breached is computed directly from the label (1 exactly
# when label == 0), so it leaks the target: the model can separate "weak" from
# the other classes using this column alone, and the accuracy printed below is
# inflated accordingly. It is kept because it is part of the model's input
# columns; confirm how the consuming application populates it before changing.
df["hibp_breached"] = [1 if label == 0 else 0 for label in labels]
df["label"] = labels

# Prepare features (X) and target (y) for training.
X = df.drop(columns="label")
y = df["label"]

# Split into training and test sets for evaluation. Stratifying keeps the
# weak/medium/strong proportions identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train the Random Forest Classifier with tuned parameters.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,          # Limit depth to prevent overfitting
    min_samples_split=5,   # Require at least 5 samples to split
    random_state=42,
    n_jobs=-1,             # Build the trees in parallel on all available cores
)
model.fit(X_train, y_train)

# Evaluate model accuracy on the held-out test set.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2%}")

# Serialize trained model for use in the application.
joblib.dump(model, "password_health_model.pkl")
print("Model saved as 'password_health_model.pkl'")