Enterprise AutoML System
Production-grade automated machine learning system with neural architecture search, hyperparameter optimization, and automated feature engineering
System Architecture
A comprehensive AutoML system that automates the entire machine learning pipeline from data preprocessing to model deployment.
Core Components
1. Neural Architecture Search
class NASController:
def __init__(self, config: Dict[str, Any]):
self.search_space = ModelSearchSpace(
min_layers=config['min_layers'],
max_layers=config['max_layers'],
operations=config['operations']
)
self.controller = LSTMController(
input_size=config['input_size'],
hidden_size=config['hidden_size']
)
self.optimizer = torch.optim.Adam(
self.controller.parameters(),
lr=config['learning_rate']
)
def sample_architecture(self) -> ModelArchitecture:
actions = []
hidden = None
# Sample architecture sequentially
for _ in range(self.search_space.max_decisions):
logits, hidden = self.controller(
self._encode_current_state(actions),
hidden
)
action = torch.multinomial(
F.softmax(logits, dim=-1),
num_samples=1
)
actions.append(action)
return self.search_space.decode_actions(actions)
def update_controller(
self,
architectures: List[ModelArchitecture],
performances: List[float]
) -> float:
# Compute rewards
rewards = self._compute_rewards(performances)
# Update controller using REINFORCE
loss = 0
for arch, reward in zip(architectures, rewards):
actions = self.search_space.encode_architecture(arch)
log_probs = self._compute_log_probs(actions)
loss -= (log_probs * reward).mean()
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
return loss.item()
2. Hyperparameter Optimization
class BayesianOptimizer:
def __init__(self, config: Dict[str, Any]):
self.space = HyperparameterSpace(config['param_space'])
self.gaussian_process = GaussianProcessRegressor(
kernel=config['kernel'],
alpha=config['noise']
)
self.acquisition = UpperConfidenceBound(
beta=config['beta']
)
def suggest_hyperparameters(
self,
n_suggestions: int = 1
) -> List[Dict[str, Any]]:
if len(self.trials) < self.n_random_starts:
return self._random_suggestions(n_suggestions)
# Optimize acquisition function
X = self._optimize_acquisition(n_suggestions)
# Convert to hyperparameter space
return [
self.space.transform(x)
for x in X
]
def update(
self,
hyperparameters: List[Dict[str, Any]],
scores: List[float]
) -> None:
X = np.array([
self.space.flatten(hp)
for hp in hyperparameters
])
y = np.array(scores)
# Update Gaussian Process
if self.gaussian_process.X_train_ is None:
self.gaussian_process.fit(X, y)
else:
X_train = np.vstack([
self.gaussian_process.X_train_,
X
])
y_train = np.concatenate([
self.gaussian_process.y_train_,
y
])
self.gaussian_process.fit(X_train, y_train)
3. Feature Engineering
class AutoFeatureEngineer:
def __init__(self, config: Dict[str, Any]):
self.transformers = {
'numeric': NumericTransformer(config['numeric_config']),
'categorical': CategoricalTransformer(config['categorical_config']),
'temporal': TemporalFeatureGenerator(config['temporal_config']),
'text': TextFeatureGenerator(config['text_config'])
}
self.feature_selector = FeatureSelector(
selection_method=config['selection_method']
)
def generate_features(
self,
data: pd.DataFrame
) -> Tuple[pd.DataFrame, FeatureMetadata]:
# Generate features for each type
feature_sets = []
metadata = []
for col_type, transformer in self.transformers.items():
cols = self._get_columns_of_type(data, col_type)
if cols:
features, meta = transformer.transform(data[cols])
feature_sets.append(features)
metadata.extend(meta)
# Combine features
all_features = pd.concat(feature_sets, axis=1)
# Select best features
selected_features = self.feature_selector.select(
all_features,
target=data[self.target_column]
)
return selected_features, metadata
class FeatureSelector:
def __init__(self, selection_method: str):
self.methods = {
'mutual_info': self._mutual_information_selection,
'lasso': self._lasso_selection,
'tree': self._tree_importance_selection
}
self.method = self.methods[selection_method]
def select(
self,
features: pd.DataFrame,
target: pd.Series,
k: Optional[int] = None
) -> pd.DataFrame:
importance_scores = self.method(features, target)
if k is None:
k = self._estimate_optimal_features(
importance_scores
)
selected_cols = importance_scores.nlargest(k).index
return features[selected_cols]
4. Pipeline Optimization
class PipelineOptimizer:
def __init__(self, config: Dict[str, Any]):
self.nas_controller = NASController(config['nas_config'])
self.hpo_optimizer = BayesianOptimizer(config['hpo_config'])
self.feature_engineer = AutoFeatureEngineer(config['feature_config'])
self.evaluator = ModelEvaluator(config['eval_config'])
async def optimize_pipeline(
self,
dataset: Dataset,
time_budget: int
) -> Tuple[Pipeline, float]:
start_time = time.time()
best_pipeline = None
best_score = float('-inf')
while time.time() - start_time < time_budget:
# Sample architecture and hyperparameters
architecture = self.nas_controller.sample_architecture()
hyperparameters = self.hpo_optimizer.suggest_hyperparameters()
# Generate features
features, _ = self.feature_engineer.generate_features(
dataset.train_data
)
# Build and evaluate pipeline
pipeline = self._build_pipeline(
architecture,
hyperparameters,
features
)
score = await self.evaluator.evaluate(pipeline, dataset)
# Update optimizers
self.nas_controller.update([architecture], [score])
self.hpo_optimizer.update([hyperparameters], [score])
# Update best pipeline
if score > best_score:
best_pipeline = pipeline
best_score = score
return best_pipeline, best_score
Usage Example
# Initialize AutoML system
config = {
'nas_config': {
'min_layers': 2,
'max_layers': 10,
'operations': ['conv3x3', 'conv5x5', 'maxpool', 'avgpool'],
'input_size': 32,
'hidden_size': 100
},
'hpo_config': {
'param_space': {
'learning_rate': ('log_float', 1e-4, 1e-1),
'batch_size': ('int', 16, 256),
'optimizer': ('categorical', ['adam', 'sgd', 'rmsprop'])
}
},
'feature_config': {
'selection_method': 'mutual_info',
'numeric_config': {'max_bins': 10},
'categorical_config': {'max_categories': 20}
}
}
automl = PipelineOptimizer(config)
# Run optimization
best_pipeline, best_score = await automl.optimize_pipeline(
dataset,
time_budget=3600 # 1 hour
)
# Deploy best pipeline
deployment = ModelDeployment(best_pipeline)
await deployment.deploy()