import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential

# Load the dataset
file_path = 'spectraldb.csv'  # Replace with the actual file path
spectral_data = pd.read_csv(file_path)

# Step 1: Remove rows with missing SCIMeasures or LAB values
spectral_data_cleaned = spectral_data.dropna(subset=['SCIMeasures', 'L', 'a', 'b']).copy()

# Step 2: Convert 'SCIMeasures' (a string representation of a dictionary) into an
# actual dictionary keyed by integer wavelength, sorted by wavelength
def extract_spectral_data(spectral_column):
    return spectral_column.apply(
        lambda x: {int(k): v for k, v in sorted(ast.literal_eval(x).items())}
    )

# Apply the function to extract SCIMeasures
spectral_data_cleaned['SCIMeasures'] = extract_spectral_data(spectral_data_cleaned['SCIMeasures'])

# Step 3: Keep only samples with exactly 39 spectral measurements
spectral_data_filtered = spectral_data_cleaned[
    spectral_data_cleaned['SCIMeasures'].apply(len) == 39
].copy()

# Extract relevant information: Sample ID and Name
sample_id = spectral_data_filtered['ID']
sample_name = spectral_data_filtered['Name']

# Extract the SCIMeasures and create a DataFrame with one column per wavelength (39 in total)
wavelengths_39 = sorted(spectral_data_filtered['SCIMeasures'].iloc[0].keys())
spectral_df_39 = pd.DataFrame(columns=wavelengths_39)

# Iterate through each row and add the spectral values for the 39 wavelengths
for i, row in spectral_data_filtered.iterrows():
    spectral_values = row['SCIMeasures']
    spectral_df_39.loc[i] = [spectral_values[w] for w in wavelengths_39]

# Add the Sample ID, Name, and LAB values
spectral_df_39.insert(0, 'Sample ID', sample_id)
spectral_df_39.insert(1, 'Sample Name', sample_name)
spectral_df_39[['L', 'a', 'b']] = spectral_data_filtered[['L', 'a', 'b']]

# Save the filtered DataFrame to a CSV file
output_file_filtered = 'refined_spectral_data.csv'
spectral_df_39.to_csv(output_file_filtered, index=False)
print(f"Filtered data saved to {output_file_filtered}")

# Load the filtered dataset with 39 spectral measurements per sample
file_path = 'refined_spectral_data.csv'  # Path to the filtered CSV file created above
spectral_data = pd.read_csv(file_path)

# Column layout of the refined CSV: Sample ID, Sample Name, the 39 spectral
# columns (0-indexed positions 2-40), then L, a, b. LAB is the model input
# and the 39-value spectrum is the target.
X = spectral_data[['L', 'a', 'b']].values  # LAB values as input
y = spectral_data.iloc[:, 2:41].values     # 39 spectral values as target (columns 2-40)

# Split data into training and testing sets, keeping the row indices so the
# test samples can later be mapped back to their Sample ID and Name
indices = np.arange(len(X))
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, indices, test_size=0.2, random_state=42
)

# Standardize LAB values (input)
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Standardize spectral data (output)
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)
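# --- Optional sanity check (an added sketch, not part of the original pipeline) ---
# Verify the shapes before training: inputs should be 3 LAB channels and targets
# 39 spectral values. An assertion here catches slicing mistakes in the
# iloc-based column selection above.
assert X_train_scaled.shape[1] == 3, "expected 3 LAB input features"
assert y_train_scaled.shape[1] == 39, "expected 39 spectral targets"
print(f"Train: {X_train_scaled.shape} -> {y_train_scaled.shape}, "
      f"Test: {X_test_scaled.shape} -> {y_test_scaled.shape}")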
# Define the model with an explicit Input layer
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),  # Input layer (3 LAB values)
    Dense(64, activation='relu'),             # Hidden layer 1
    Dense(128, activation='relu'),            # Hidden layer 2
    Dense(256, activation='relu'),            # Hidden layer 3
    Dense(y_train_scaled.shape[1])            # Output layer for 39 spectral values
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model and save the training history
history = model.fit(X_train_scaled, y_train_scaled, epochs=100, batch_size=16,
                    validation_split=0.1)

# Evaluate the model on test data
loss = model.evaluate(X_test_scaled, y_test_scaled)
print(f"Test MSE: {loss}")

# Predict the spectral data using the trained model
y_pred_scaled = model.predict(X_test_scaled)

# Inverse transform the predicted and test data back to the original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_original = scaler_y.inverse_transform(y_test_scaled)

# Calculate metrics for both training and testing data
y_train_pred_scaled = model.predict(X_train_scaled)
y_train_pred = scaler_y.inverse_transform(y_train_pred_scaled)
y_train_original = scaler_y.inverse_transform(y_train_scaled)

# Calculate metrics
train_mse = mean_squared_error(y_train_original, y_train_pred)
test_mse = mean_squared_error(y_test_original, y_pred)
train_mape = mean_absolute_percentage_error(y_train_original, y_train_pred)
test_mape = mean_absolute_percentage_error(y_test_original, y_pred)
train_r2 = r2_score(y_train_original, y_train_pred)
test_r2 = r2_score(y_test_original, y_pred)

# Display the results
print(f"Training MSE: {train_mse}, Testing MSE: {test_mse}")
print(f"Training MAPE: {train_mape}, Testing MAPE: {test_mape}")
print(f"Training R2: {train_r2}, Testing R2: {test_r2}")

# Plot training & validation loss values (only the loss)
plt.figure(figsize=(8, 6))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend(loc='upper right')
plt.show()
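# --- Optional diagnostic (an added sketch; assumes the `wavelengths_39` list
# computed earlier in this script is still in scope) ---
# The aggregate MSE above hides *where* along the spectrum the model struggles.
# A per-wavelength RMSE on the test set makes systematic errors, e.g. at the
# band edges, visible.
per_wavelength_rmse = np.sqrt(np.mean((y_test_original - y_pred) ** 2, axis=0))
plt.figure(figsize=(8, 4))
plt.plot(wavelengths_39, per_wavelength_rmse)
plt.xlabel('Wavelength (nm)')
plt.ylabel('RMSE')
plt.title('Per-wavelength test RMSE')
plt.show()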
# Select 20 random colors from the test set
num_samples = 20
random_indices = np.random.choice(len(y_test_original), num_samples, replace=False)
y_pred_sample = y_pred[random_indices]                    # Predicted spectra for the 20 samples
y_test_original_sample = y_test_original[random_indices]  # Actual spectra for the 20 samples

# Map the test-set positions back to rows of the original DataFrame to get the
# Sample IDs and Names (idx_test was returned by train_test_split above)
sample_ids = spectral_data['Sample ID'].values[idx_test[random_indices]]
sample_names = spectral_data['Sample Name'].values[idx_test[random_indices]]

# Recover the 39 wavelengths from the column headers of the refined CSV
# (columns 2-40), rather than assuming an evenly spaced 360-780 nm grid
wavelengths = spectral_data.columns[2:41].astype(float)

# Create the plot with 4 rows and 5 columns (20 plots in total)
fig, axes = plt.subplots(4, 5, figsize=(18, 20))

# Reduce font size for the ticks and labels
plt.rc('font', size=10)

for i in range(num_samples):
    row = i // 5
    col = i % 5
    axes[row, col].plot(wavelengths, y_test_original_sample[i], label='Actual', color='blue')
    axes[row, col].plot(wavelengths, y_pred_sample[i], label='Predicted',
                        linestyle='dashed', color='red')
    axes[row, col].set_xlabel('Wavelength (nm)', fontsize=8)
    axes[row, col].set_ylabel('Reflectance', fontsize=8)
    axes[row, col].tick_params(axis='both', which='major', labelsize=6)  # Reduce tick font size
    axes[row, col].text(0.5, -0.15, f"ID: {sample_ids[i]}, Name: {sample_names[i]}",
                        fontsize=8, ha='center', transform=axes[row, col].transAxes)

# Create a single legend for the entire figure in the top-right corner,
# using the handles and labels from one subplot
lines, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(lines, labels, loc='upper right', fontsize=8)

# Adjust layout to avoid overlap
plt.tight_layout()
plt.show()

# Save the model in Keras format
model.save('spectral_reconstruction_model.keras')

# Load the model; the .keras format restores the compile state, so the
# explicit recompile below is only a safeguard
loaded_model = tf.keras.models.load_model('spectral_reconstruction_model.keras')
loaded_model.compile(optimizer='adam', loss='mean_squared_error')

# View the summary of the loaded model to confirm it round-tripped intact
loaded_model.summary()
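# --- Usage example (an added sketch; the LAB triple below is hypothetical) ---
# Reconstruct a spectrum for one new LAB colour with the loaded model. The same
# scalers fitted above must be applied: scale the LAB input, predict, then
# inverse-transform the output back to reflectance values.
lab_sample = np.array([[55.0, 10.0, -20.0]])  # hypothetical L, a, b values
lab_scaled = scaler_X.transform(lab_sample)   # reuse the fitted input scaler
spectrum_scaled = loaded_model.predict(lab_scaled)
spectrum = scaler_y.inverse_transform(spectrum_scaled)[0]  # 39 reflectance values
print(spectrum)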