# import necessary libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset via sklearn.datasets. The returned object's
# `.data` (features) and `.target` (labels) attributes are read further down.
cancer = load_breast_cancer()

# import NumPy library
import numpy as np

# Separate features and target variable.
# X is built as an independent float copy: the corruption steps below write
# in place, and aliasing `cancer.data` directly would silently damage the
# loaded dataset object as well.
X = np.array(cancer.data, dtype=float)
y = cancer.target

# Add missing values to the data: every cell is independently replaced by NaN
# when its uniform draw falls below 0.2. The fixed seed keeps the mask
# reproducible between runs.
np.random.seed(42)
missing_mask = np.random.rand(*X.shape) < 0.2
X[missing_mask] = np.nan

# Put the first three feature columns on deliberately mismatched scales
# (x1000, x100, x10). NaN cells remain NaN after multiplication.
X[:, 0] *= 1000
X[:, 1] *= 100
X[:, 2] *= 10

# import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# import necessary libraries
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Assemble the pipeline: impute missing cells, standardize the features,
# then fit a linear regression on the result.
pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer()),
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
)

# Fit on the training split, then predict on the held-out test split.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)


# import metrics
from sklearn.metrics import mean_squared_error, r2_score

# Score the test-set predictions: mean squared error and R-squared.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, rounded to two decimal places.
print(f"Mean squared error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")