Testing of SAMueL-1 synthetic data: Compare differences between synthetic data and real data nearest-neighbours#

This notebook shows some examples of synthetic data alongside the nearest-neighbour real data used to create the synthetic data, and show the distribution of distances between real data point nearest neighbours and nearest neighbour distances between synthetic and real data points. Distance between real and synthetic data is measured as Cartesian distance between standardised data for admissions to the same stroke unit.

We look at synthetic data produced directly from real data, ‘double synthetic’ data - which is data produced from the first round of synthetic data, and ‘triple synthetic’ data - which is data produced from the double synthetic data.

Load packages#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Turn warnings off to keep notebook clean
import warnings

warnings.filterwarnings("ignore")

Load data#

data_loc = './../data/sam_1/kfold_5fold/'

train = pd.read_csv(data_loc + 'train_0.csv')
test = pd.read_csv(data_loc + 'test_0.csv')
synthetic = pd.read_csv(data_loc + 'synthetic_0.csv')
synthetic_double = pd.read_csv(data_loc + 'synthetic_double_0.csv')
synthetic_triple = pd.read_csv(data_loc + 'synthetic_triple_0.csv')

X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
X_synthetic = synthetic.drop('S2Thrombolysis', axis=1)
X_synthetic_double = synthetic_double.drop('S2Thrombolysis', axis=1)
X_synthetic_triple = synthetic_triple.drop('S2Thrombolysis', axis=1)

Functions#

Function to standardise data#

def standardise_data(X_train, X_test):
    # Initialise a new scaling object for normalising input data
    sc = StandardScaler()

    # Set up the scaler just on the training set
    sc.fit(X_train)

    # Apply the scaler to the training and test sets
    train_std = sc.transform(X_train)
    test_std = sc.transform(X_test)

    return train_std, test_std

Function to find closest data points#

def find_distance_to_closest_real_data(X_actual, X_synthetic):
    """
    Find nearest neighbour in the real data set (based on Cartesian distance of
    standardised data).
    """

    # Standardise data (based on real training data)
    X_actual_std, X_synth_std = standardise_data(
        X_actual, X_synthetic)

    # Use ScitLearn neighbors.NearestNeighbors to get nearest neighbour
    nn = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(X_actual_std)
    dists, idxs = nn.kneighbors(X_synth_std)

    # Convert row idxs to index in original index in X_actual
    real_ids = list(X_actual.iloc[idxs.flatten()].index)

    # Store in dictionary
    nearest_neighbours = dict()
    nearest_neighbours['distances'] = list(dists.flatten())
    nearest_neighbours['ids'] = real_ids

    return nearest_neighbours

Get closest distances between real data points, and between synthetic and real data#

Standardise real (X_train) data and set up nearest neighbour algorithm#

X_actual_std, X_actual_std = standardise_data(
    X_train.drop('StrokeTeam', axis=1),
    X_train.drop('StrokeTeam', axis=1))

nn = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(X_actual_std)

Get nearest neighbours in real data#

# Get closest neighbours in real data
dists, idxs = nn.kneighbors()
dist_real_real = dists.flatten()
indx_real_real = idxs.flatten()

Get nearest neighbours between synthetic data and real data#

(Using synthetic data produced directly from real data).

# Get closest neighbours between synthetic and real data
X_actual_std, X_synthetic_std = standardise_data(
    X_train.drop('StrokeTeam', axis=1),
    X_synthetic.drop('StrokeTeam', axis=1))

dists, idxs = nn.kneighbors(X_synthetic_std)
dist_synthetic_real = dists.flatten()
indx_synthetic_real = idxs.flatten()

Get nearest neighbours between double synthetic data and real data#

(Using synthetic data produced from synthetic data).

# Get closest neighbours between double synthetic and real data
X_actual_std, X_synthetic_double_std = standardise_data(
    X_train.drop('StrokeTeam', axis=1),
    X_synthetic_double.drop('StrokeTeam', axis=1))

dists, idxs = nn.kneighbors(X_synthetic_double_std)
dist_synthetic_double_real = dists.flatten()
indx_synthetic_double_real = idxs.flatten()

Get nearest neighbours between triple synthetic data and real data#

(Using synthetic data produced from double synthetic data).

# Get closest neighbours between triple synthetic and real data
X_actual_std, X_synthetic_triple_std = standardise_data(
    X_train.drop('StrokeTeam', axis=1),
    X_synthetic_triple.drop('StrokeTeam', axis=1))

dists, idxs = nn.kneighbors(X_synthetic_triple_std)
dist_synthetic_triple_real = dists.flatten()
indx_synthetic_triple_real = idxs.flatten()

Plot histogram of distances to nearest real data point#

bins = np.arange(0, 12.1, 0.2)
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
ax.hist(dist_real_real, bins=bins, histtype='step', color='b', 
        density=True, label='real-real')
ax.hist(dist_synthetic_real, bins=bins, histtype='step', color='r', 
        density=True, label='synthetic-real')
ax.hist(dist_synthetic_double_real, bins=bins, histtype='step', color='y', 
        density=True, label='double_synthetic-real')
ax.hist(dist_synthetic_triple_real, bins=bins, histtype='step', color='g', 
        density=True, label='triple_synthetic-real')

ax.set_xlabel('Distance to nearest neighbour')
ax.set_ylabel('Frequency density')
ax.legend()
plt.savefig('./images/distance_comparison.png', dpi=300)
plt.show()

../_images/02c_compare_distances_to_real_data_22_0.png

Summary statistics of distances to nearest real data point#

statistics = pd.DataFrame()
statistics['Real-Real'] = pd.DataFrame(dist_real_real).describe()
statistics['Synthetic-Real'] = pd.DataFrame(dist_synthetic_real).describe()
statistics['Double synthetic-Real'] = \
    pd.DataFrame(dist_synthetic_double_real).describe()
statistics['Triple synthetic-Real'] = \
    pd.DataFrame(dist_synthetic_triple_real).describe()

statistics

	Real-Real	Synthetic-Real	Double synthetic-Real	Triple synthetic-Real
count	71142.000000	71142.000000	71142.000000	71142.000000
mean	5.337489	1.800324	2.439078	2.795586
std	1.680298	0.883151	0.931313	0.955745
min	0.000000	0.112121	0.021724	0.455704
25%	4.186705	1.078850	1.740018	2.090752
50%	5.274546	1.677866	2.356923	2.707960
75%	6.417106	2.398763	3.038597	3.399680
max	44.942009	15.336283	22.257794	16.605757

Show proportion of points that are identical to real data#

# Count identical points between real and synthetic data
identical = np.mean(dist_synthetic_real < 0.001)
print(
    f'Proportion identical between real and synthetic data: {identical:0.3f}')

# Count identical points between real and double synthetic data
identical = np.mean(dist_synthetic_double_real < 0.001)
print(
    f'Proportion identical between real and double synthetic data: {identical:0.3f}')

Proportion identical between real and synthetic data: 0.000
Proportion identical between real and double synthetic data: 0.000

Show near neighbour examples between real data#

# Sample 4 real data points
np.random.RandomState=42
sample_index = np.random.choice(np.arange(len(X_train)), 4)
real_samples_a = X_train.iloc[sample_index]
closest_neigbour_index = indx_real_real[sample_index]
real_samples_b = X_train.iloc[closest_neigbour_index]
real_sample_differences = dist_real_real[sample_index]

# Save
real_samples_a.to_csv('./synth_examples/real_samples_a.csv')
real_samples_b.to_csv('./synth_examples/real_samples_a_nn.csv')

# Standardise
cols = list(X_train.drop('StrokeTeam', axis=1))
X_train_std, real_samples_a_std = standardise_data(
    X_train[cols], real_samples_a[cols])
X_train_std, real_samples_b_std = standardise_data(
    X_train[cols], real_samples_b[cols])

# Get axis limits
value_min = min(np.min(real_samples_a_std), np.min(real_samples_b_std)) - 0.4
value_max = max(np.max(real_samples_a_std), np.max(real_samples_b_std)) + 0.4
diff = real_samples_b_std - real_samples_a_std
diff_min = np.min(diff) - 0.2
diff_max = np.max(diff) + 0.2

fig, ax = plt.subplots(4, 2, figsize=(10, 10))
ax = ax.ravel()
for i in range(4):
    row = i
    y1 = real_samples_a_std[i]
    y2 = real_samples_b_std[i]
    x = range(len(y1))
    ax[row * 2].plot(x, y1, color='b', label='Real')
    ax[row * 2].plot(x, y2, color='r', linestyle='-', label='Closest')
    ax[row * 2].get_xaxis().set_ticks([])
    ax[row * 2].set_xlabel('Feature')
    ax[row * 2].set_ylabel('Standardised\nfeature value')
    ax[row * 2].set_ylim(value_min, value_max)
    distance = real_sample_differences[i]
    title = f'Example {i + 1} feature values (distance = {distance:0.2f})'
    ax[row * 2].set_title(title)
    ax[row * 2].legend(loc='upper right')

    y = y = real_samples_b_std[i] - real_samples_a_std[i]
    x = range(len(y1))
    ax[row * 2 + 1].plot(x, y, color='k', label='Difference')
    ax[row * 2 + 1].get_xaxis().set_ticks([])
    ax[row * 2 + 1].set_xlabel('Feature')
    ax[row * 2 + 1].set_ylabel('Diff standardised\nfeature values')
    ax[row * 2 + 1].set_ylim(diff_min, diff_max)
    title = f'Example {i + 1} feature differences (distance = {distance:0.2f})'
    ax[row * 2 + 1].set_title(title)
    ax[row * 2 + 1].legend(loc='upper right')

plt.tight_layout(pad=2)
fig.patch.set_facecolor('w')
plt.savefig('./images/nn_examples_real_to_real.png', dpi=300)
plt.show()

../_images/02c_compare_distances_to_real_data_30_0.png

Show near neighbour examples between (double) synthetic data and real data#

Double synthetic data removes the peak of instances that are in close proximity to real data, so we will proceed woth double-synthetic data.

# Sample 4 double synthetic data points
np.random.RandomState=42
sample_index = np.random.choice(np.arange(len(X_synthetic_double)), 4)
synth_examples = X_synthetic_double.iloc[sample_index]
closest_neigbour_index = indx_synthetic_double_real[sample_index]
real_examples = X_train.iloc[closest_neigbour_index]
real_synthetic_differences = dist_synthetic_double_real[sample_index]

# Save
synth_examples.to_csv('./synth_examples/synth_examples.csv')
real_examples.to_csv('./synth_examples/closest_real.csv')

# Standardise
cols = list(X_train.drop('StrokeTeam', axis=1))
X_train_std, real_examples_std = standardise_data(
    X_train[cols], real_examples[cols])
X_train_std, synth_examples_std = standardise_data(
    X_train[cols], synth_examples[cols])

# Get axis limits
value_min = min(np.min(real_examples_std), np.min(synth_examples_std)) - 0.4
value_max = max(np.max(real_examples_std), np.max(synth_examples_std)) + 0.4
diff = synth_examples_std - real_examples_std
diff_min = np.min(diff) - 0.2
diff_max = np.max(diff) + 0.2

fig, ax = plt.subplots(4, 2, figsize=(10, 10))
ax = ax.ravel()
for i in range(4):
    row = i
    y1 = real_examples_std[i]
    y2 = synth_examples_std[i]
    x = range(len(y1))
    ax[row * 2].plot(x, y1, color='b', label='Real')
    ax[row * 2].plot(x, y2, color='r', linestyle='-', label='Synthetic')
    ax[row * 2].get_xaxis().set_ticks([])
    ax[row * 2].set_xlabel('Feature')
    ax[row * 2].set_ylabel('Standardised\nfeature value')
    ax[row * 2].set_ylim(value_min, value_max)
    distance = dist_synthetic_double_real[i]
    title = f'Example {i + 1} feature values (distance = {distance:0.2f})'
    ax[row * 2].set_title(title)
    ax[row * 2].legend(loc='upper right')

    y = synth_examples_std[i] - real_examples_std[i]
    x = range(len(y1))
    ax[row * 2 + 1].plot(x, y, color='k', label='Difference')
    ax[row * 2 + 1].get_xaxis().set_ticks([])
    ax[row * 2 + 1].set_xlabel('Feature')
    ax[row * 2 + 1].set_ylabel('Diff standardised\nfeature values')
    ax[row * 2 + 1].set_ylim(diff_min, diff_max)
    title = f'Example {i + 1} feature differences (distance = {distance:0.2f})'
    ax[row * 2 + 1].set_title(title)
    ax[row * 2 + 1].legend(loc='upper right')

plt.tight_layout(pad=2)
fig.patch.set_facecolor('w')
plt.savefig('./images/nn_examples.png', dpi=300)
plt.show()

../_images/02c_compare_distances_to_real_data_34_0.png

Observations#

The nearest neighbour distance between synthetic and real data points is generally smaller than the nearest neighbour distance between real data points.
The use of double synthetic data avoids a peak of synthetic data that has close proximity to real data.
There are no identical points in the real data or between real and synthetic data (of either type).

SAMueL Stroke Audit Machine Learning 2

Testing of SAMueL-1 synthetic data: Compare differences between synthetic data and real data nearest-neighbours

Contents