# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset that ships with scikit-learn.
data = load_breast_cancer()

# Build a DataFrame of the feature matrix, one column per feature name.
df = pd.DataFrame(data.data, columns=data.feature_names)

# Simulate missing data: every feature cell is independently replaced with
# NaN with probability 0.2, so roughly (not exactly) 20% of values go missing.
# A seeded local generator makes the missing-value pattern identical on every
# run and leaves NumPy's global random state untouched.
rng = np.random.default_rng(42)
mask = rng.random(df.shape) < 0.2
df[mask] = np.nan

# Append the target only after masking so the labels never receive NaN.
df['target'] = data.target


# import necessary libraries
from sklearn.model_selection import train_test_split

# Separate the predictors (every column except the last) from the label
# (the last column).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)