"""This submodule defines the `BayesianMODNetModel`, an extension to the vanilla
model that incorporates probabilistic `DenseVariational` layers from TensorFlow
Probability.

"""

import warnings
from functools import partial
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf

try:
    import tensorflow_probability as tfp
except ImportError:
    raise ImportError(
        "`tensorflow-probability` is required for Bayesian models: install modnet[bayesian]."
    )

from modnet import __version__
from modnet.models.vanilla import MODNetModel
from modnet.preprocessing import MODData

__all__ = ("BayesianMODNetModel",)


class BayesianMODNetModel(MODNetModel):
    """Container class for the underlying Probabilistic Bayesian Neural Network, that handles
    setting up the architecture, activations, training and learning curve. Only epistemic uncertainty is taken into account.

    Attributes:
        n_feat: The number of features used in the model.
        weights: The relative loss weights for each target.
        optimal_descriptors: The list of column names used
            in training the model.
        model: The `keras.model.Model` of the network itself.
        target_names: The list of targets names that the model
            was trained for.

    """

    can_return_uncertainty = True

    def __init__(
        self,
        targets: List,
        weights: Dict[str, float],
        num_neurons=([64], [32], [16], [16]),
        num_classes: Optional[Dict[str, int]] = None,
        n_feat: Optional[int] = 64,
        act: str = "relu",
        out_act: str = "linear",
        bayesian_layers=None,
        prior=None,
        posterior=None,
        kl_weight=None,
    ):
        """Initialise the model on the passed targets with the desired
        architecture, feature count and loss functions and activation functions.

        Parameters:
            targets: A nested list of targets names that defines the hierarchy
                of the output layers.
            weights: The relative loss weights to apply for each target.
            num_neurons: A specification of the model layers, as a 4-tuple
                of lists of integers. Hidden layers are split into four
                blocks of `tf.keras.layers.Dense`, with neuron count specified
                by the elements of the `num_neurons` argument.
            bayesian_layers: Same shape as num_neurons, with True for a Bayesian DenseVariational layer,
                False for a normal Dense layer. Default is None and will only set last layer as Bayesian.
            prior: Prior to use for the DenseVariational layers, default is independent normal with learnable mean.
            posterior: Posterior to use for the DenseVariational layers, default is indepent normal with learnable mean and variance.
            kl_weight: Amount by which to scale the KL divergence loss between prior and posterior.
            num_classes: Dictionary defining the target types (classification or regression).
                Should be constructed as follows: key: string giving the target name; value: integer n,
                 with n=0 for regression and n>=2 for classification with n the number of classes.
            n_feat: The number of features to use as model inputs.
            act: A string defining a tf.keras activation function to pass to use
                in the `tf.keras.layers.Dense` layers.
            out_act: A string defining a tf.keras activation function to pass to use
                for the last output layer

        """

        warnings.warn(
            "BayesianMODNetModel is deprecated and may be removed in the future.",
            DeprecationWarning,
        )

        self.__modnet_version__ = __version__

        if n_feat is None:
            n_feat = 64
        self.n_feat = n_feat
        self.weights = weights
        self.num_classes = num_classes
        self.num_neurons = num_neurons
        self.act = act
        self.out_act = out_act

        self._scaler = None
        self.optimal_descriptors = None
        self.target_names = None
        self.targets = targets
        self.model = None

        self.targets_groups = [x for subl in targets for x in subl]
        self.targets_flatten = [x for subl in self.targets_groups for x in subl]
        self.num_classes = {name: 0 for name in self.targets_flatten}
        if num_classes is not None:
            self.num_classes.update(num_classes)
        self._multi_target = len(self.targets_flatten) > 1
        self.multi_label = False  # forced for compatibility with vanilla

        self.model = self.build_model(
            targets,
            n_feat,
            num_neurons,
            bayesian_layers=bayesian_layers,
            prior=prior,
            posterior=posterior,
            kl_weight=kl_weight,
            act=act,
            out_act=out_act,
            num_classes=self.num_classes,
        )

    def build_model(
        self,
        targets: List,
        n_feat: int,
        num_neurons: Tuple[List[int], List[int], List[int], List[int]],
        bayesian_layers=None,
        prior=None,
        posterior=None,
        kl_weight=None,
        num_classes: Optional[Dict[str, int]] = None,
        act: str = "relu",
        out_act: str = "relu",
    ):
        """Builds the Bayesian Neural Network and sets the `self.model` attribute.

        Parameters:
            targets: A nested list of targets names that defines the hierarchy
                of the output layers.
            n_feat: The number of features to use as model inputs.
            num_neurons: A specification of the model layers, as a 4-tuple
                of lists of integers. Hidden layers are split into four
                blocks of `keras.layers.Dense`, with neuron count specified
                by the elements of the `num_neurons` argument.
            num_classes: Dictionary defining the target types (classification or regression).
                Should be constructed as follows: key: string giving the target name; value: integer n,
                with n=0 for regression and n>=2 for classification with n the number of classes.
            act: A string defining a Keras activation function to pass to use
                in the `keras.layers.Dense` layers.
            out_act: A string defining a tf.keras activation function to pass to use
                for the last output layer

        """

        num_layers = [len(x) for x in num_neurons]

        # define probabilistic layers
        tfd = tfp.distributions

        if bayesian_layers is None:
            bayesian_layers = [[False] * nl for nl in num_layers]

        if posterior is None:

            def posterior(kernel_size, bias_size=0, dtype=None):
                n = kernel_size + bias_size
                c = np.log(np.expm1(1.0))
                return tf.keras.Sequential(
                    [
                        tfp.layers.VariableLayer(2 * n, dtype=dtype),
                        tfp.layers.DistributionLambda(
                            lambda t: tfd.Independent(
                                tfd.Normal(
                                    loc=t[..., :n],
                                    scale=1e-5 + tf.nn.softplus(c + t[..., n:]),
                                ),
                                reinterpreted_batch_ndims=1,
                            )
                        ),
                    ]
                )

        if prior is None:

            def prior(kernel_size, bias_size=0, dtype=None):
                n = kernel_size + bias_size
                return tf.keras.Sequential(
                    [
                        tfp.layers.VariableLayer(n, dtype=dtype),
                        tfp.layers.DistributionLambda(
                            lambda t: tfd.Independent(
                                tfd.Normal(loc=t, scale=1), reinterpreted_batch_ndims=1
                            )
                        ),
                    ]
                )

        bayesian_layer = partial(
            tfp.layers.DenseVariational,
            make_posterior_fn=posterior,
            make_prior_fn=prior,
            kl_weight=1 / 3619,
            activation=act,
        )
        dense_layer = partial(tf.keras.layers.Dense, activation=act)

        # Build first common block
        f_input = tf.keras.layers.Input(shape=(n_feat,))
        previous_layer = f_input
        for i in range(num_layers[0]):
            if bayesian_layers[0][i]:
                previous_layer = bayesian_layer(num_neurons[0][i])(previous_layer)
            else:
                previous_layer = dense_layer(num_neurons[0][i])(previous_layer)
            if self._multi_target:
                previous_layer = tf.keras.layers.BatchNormalization()(previous_layer)
        common_out = previous_layer

        # Build intermediate representations
        intermediate_models_out = []
        for _ in range(len(targets)):
            previous_layer = common_out
            for j in range(num_layers[1]):
                if bayesian_layers[1][j]:
                    previous_layer = bayesian_layer(num_neurons[1][j])(previous_layer)
                else:
                    previous_layer = dense_layer(num_neurons[1][j])(previous_layer)
                if self._multi_target:
                    previous_layer = tf.keras.layers.BatchNormalization()(
                        previous_layer
                    )
            intermediate_models_out.append(previous_layer)

        # Build outputs
        final_out = []
        output_names = []
        for group_idx, group in enumerate(targets):
            for prop_idx in range(len(group)):
                previous_layer = intermediate_models_out[group_idx]
                for k in range(num_layers[2]):
                    if bayesian_layers[2][k]:
                        previous_layer = bayesian_layer(num_neurons[2][k])(
                            previous_layer
                        )
                    else:
                        previous_layer = dense_layer(num_neurons[2][k])(previous_layer)
                    if self._multi_target:
                        previous_layer = tf.keras.layers.BatchNormalization()(
                            previous_layer
                        )
                n = num_classes[group[prop_idx][0]]
                name = group[prop_idx][0]
                if n >= 2:
                    out = tfp.layers.DenseVariational(
                        n,
                        make_posterior_fn=posterior,
                        make_prior_fn=prior,
                        kl_weight=kl_weight,
                        activation="softmax",
                        name=name,
                    )(previous_layer)
                else:
                    out = tfp.layers.DenseVariational(
                        len(group[prop_idx]),
                        make_posterior_fn=posterior,
                        make_prior_fn=prior,
                        kl_weight=kl_weight,
                        activation=out_act,
                        name=name,
                    )(previous_layer)
                final_out.append(out)
                output_names.append(name)

        new_weights = dict()
        for n in output_names:
            w = self.weights.get(n, 1)
            new_weights[n] = w
        self.weights = new_weights

        return tf.keras.models.Model(inputs=f_input, outputs=final_out)

    def predict(
        self, test_data: MODData, return_prob=False, return_unc=False
    ) -> pd.DataFrame:
        """Predict the target values for the passed MODData.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.
            return_prob: For a classification tasks only: whether to return the probability of each
                class OR only return the most probable class.
            return_unc: whether to return the standard deviation as a second dataframe

        Returns:
            A `pandas.DataFrame` containing the predicted values of the targets.
            If return_unc=True, two `pandas.DataFrame` : (predictions,std) containing the predicted values of the targets and
             the standard deviations of the epistemic uncertainty.


        """
        # prevents Nan predictions if some features are inf
        x = (
            test_data.get_featurized_df()
            .replace([np.inf, -np.inf, np.nan], 0)[
                self.optimal_descriptors[: self.n_feat]
            ]
            .values
        )

        # Scale the input features:
        x = np.nan_to_num(x)
        if self._scaler is not None:
            x = self._scaler.transform(x)
            x = np.nan_to_num(x)

        all_predictions = []

        for i in range(1000):
            p = self.model.predict(x)
            if len(self.targets_groups) == 1:
                p = np.array([p])
            all_predictions.append(p)

        p_dic = {}
        unc_dic = {}
        for i, props in enumerate(self.targets_groups):
            name = props[0]
            if self.num_classes[name] >= 2:
                if return_prob:
                    preds = np.array([pred[i] for pred in all_predictions])
                    probs = preds / (preds.sum(axis=-1)).reshape((-1, 1))
                    mean_prob = probs.mean()
                    std_prob = probs.std()
                    for j in range(mean_prob.shape[-1]):
                        p_dic["{}_prob_{}".format(name, j)] = mean_prob[:, j]
                        unc_dic["{}_prob_{}".format(name, j)] = std_prob[:, j]
                else:
                    p_dic[name] = np.argmax(
                        np.array([pred[i] for pred in all_predictions]).mean(axis=0),
                        axis=1,
                    )
                    unc_dic[name] = np.max(
                        np.array([pred[i] for pred in all_predictions]).mean(axis=0),
                        axis=1,
                    )
            else:
                for j, name in enumerate(props):
                    mean_p = np.array([pred[i][:, j] for pred in all_predictions]).mean(
                        axis=0
                    )
                    std_p = np.array([pred[i][:, j] for pred in all_predictions]).std(
                        axis=0
                    )
                    p_dic[name] = mean_p
                    unc_dic[name] = std_p

        predictions = pd.DataFrame(p_dic)
        unc = pd.DataFrame(unc_dic)

        predictions.index = test_data.structure_ids
        unc.index = test_data.structure_ids

        if return_unc:
            return predictions, unc
        else:
            return predictions

    def fit_preset(*args, **kwargs):
        """Not implemented"""

        raise RuntimeError("Not implemented.")

    def save(self, filename: str):
        raise RuntimeError("Save not implemented for Bayesian model")

    def load(filename: str):
        raise RuntimeError("Load not implemented for Bayesian model")