Proper data preparation is crucial for accurate AQI predictions. This guide covers loading, cleaning, validating, and transforming environmental data into features suitable for machine learning models.
AQI prediction models typically require hourly or daily measurements of pollutants and meteorological variables collected over an extended period (at least 6-12 months for reliable training).
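As a rough sketch of what that input might look like, the snippet below loads hourly measurements from a CSV and checks basic coverage. The file path and the exact column names are assumptions chosen to match the features used in the steps that follow.

import pandas as pd

# Hypothetical input file; column names mirror those used in later steps
df = pd.read_csv('data/hourly_air_quality.csv', parse_dates=['timestamp'])

expected_cols = ['timestamp', 'pm25', 'pm10', 'no2', 'so2', 'co',
                 'temperature', 'humidity', 'aqi']
missing = set(expected_cols) - set(df.columns)
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Verify the series is long enough for reliable training (6-12+ months)
span = df['timestamp'].max() - df['timestamp'].min()
print(f"Data spans {span.days} days ({len(df)} rows)")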
Create derived features that capture temporal patterns and interactions.
import numpy as np

def engineer_features(df):
    """Create temporal and interaction features"""
    df = df.copy()

    # Temporal features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Cyclical encoding for hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    # Season encoding
    df['season'] = df['month'].map({
        12: 0, 1: 0, 2: 0,   # Winter
        3: 1, 4: 1, 5: 1,    # Spring
        6: 2, 7: 2, 8: 2,    # Summer
        9: 3, 10: 3, 11: 3   # Fall
    })

    # Lagged features (previous hours)
    for lag in [1, 3, 6, 12, 24]:
        df[f'pm25_lag_{lag}h'] = df['pm25'].shift(lag)
        df[f'pm10_lag_{lag}h'] = df['pm10'].shift(lag)

    # Rolling statistics (moving averages)
    for window in [3, 6, 12, 24]:
        df[f'pm25_rolling_mean_{window}h'] = df['pm25'].rolling(window).mean()
        df[f'pm25_rolling_std_{window}h'] = df['pm25'].rolling(window).std()

    # Interaction features
    df['pm_ratio'] = df['pm25'] / (df['pm10'] + 1e-5)
    df['temp_humidity'] = df['temperature'] * df['humidity']

    # Pollutant index (weighted combination of pollutant concentrations)
    df['pollutant_index'] = (
        df['pm25'] * 0.3 +
        df['pm10'] * 0.2 +
        df['no2'] * 0.2 +
        df['so2'] * 0.15 +
        df['co'] * 0.15
    )

    return df

df_features = engineer_features(df_clean)

# Drop rows with NaN introduced by lagged/rolling features
df_features = df_features.dropna()

print(f"Feature engineering complete: {len(df_features.columns)} features")
Lagged features and rolling statistics are particularly important for time series prediction, as they capture recent pollution trends.
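A quick, optional sanity check on the lag choices is to measure how strongly the target pollutant correlates with its own past values; pandas' Series.autocorr computes the lag-k autocorrelation directly. This sketch assumes the cleaned frame df_clean from the earlier steps.

# High autocorrelation at a lag suggests that lag makes a useful feature
for lag in [1, 3, 6, 12, 24]:
    corr = df_clean['pm25'].autocorr(lag=lag)
    print(f"PM2.5 autocorrelation at lag {lag}h: {corr:.3f}")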
5. Normalize Features
Scale features to ensure consistent ranges for model training.
import os
import joblib
import pandas as pd
from sklearn.preprocessing import RobustScaler

# Separate features and target
feature_cols = [col for col in df_features.columns if col not in ['timestamp', 'aqi']]
X = df_features[feature_cols]
y = df_features['aqi']

# Use RobustScaler (less sensitive to outliers than StandardScaler)
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=X.index)

# Save scaler for inference
os.makedirs('models', exist_ok=True)
joblib.dump(scaler, 'models/feature_scaler.pkl')
print("Feature scaling complete")

# Verify scaling
print("\nScaled feature statistics:")
print(X_scaled.describe())
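At inference time, the saved scaler must be reloaded and applied to incoming features with transform (never fit_transform, which would refit it). A minimal sketch, where X_new is a hypothetical DataFrame with the same feature_cols:

import joblib

# Reload the fitted scaler and apply it to new, unscaled feature rows
scaler = joblib.load('models/feature_scaler.pkl')
X_new_scaled = scaler.transform(X_new)  # X_new: hypothetical frame with feature_cols

Note that fitting the scaler on the full dataset, as above, lets test-set statistics influence the transform. A stricter pipeline performs the chronological split first (next step), fits the scaler on the training portion only, and reuses it for validation and test.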
6. Split Data
Create training, validation, and test sets with temporal awareness.
def temporal_train_test_split(X, y, timestamps, train_size=0.7, val_size=0.15):
    """Split time series data chronologically"""
    n = len(X)
    train_end = int(n * train_size)
    val_end = int(n * (train_size + val_size))

    X_train = X.iloc[:train_end]
    y_train = y.iloc[:train_end]
    X_val = X.iloc[train_end:val_end]
    y_val = y.iloc[train_end:val_end]
    X_test = X.iloc[val_end:]
    y_test = y.iloc[val_end:]

    print(f"Train: {len(X_train)} samples ({timestamps.iloc[0]} to {timestamps.iloc[train_end-1]})")
    print(f"Val:   {len(X_val)} samples ({timestamps.iloc[train_end]} to {timestamps.iloc[val_end-1]})")
    print(f"Test:  {len(X_test)} samples ({timestamps.iloc[val_end]} to {timestamps.iloc[-1]})")

    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = temporal_train_test_split(
    X_scaled, y, df_features['timestamp']
)
Never use random shuffling for time series data. Always split chronologically to avoid data leakage from future to past.
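For cross-validation under the same constraint, scikit-learn's TimeSeriesSplit produces folds that always train on the past and validate on a later window; a minimal sketch:

from sklearn.model_selection import TimeSeriesSplit

# Each fold trains on earlier rows and validates on a later, unseen window
tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_scaled)):
    print(f"Fold {fold}: train rows 0-{train_idx[-1]}, "
          f"validate rows {val_idx[0]}-{val_idx[-1]}")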