mv-tpe-nlp-study #165

david-thrower opened this issue Apr 4, 2025 · 0 comments
Open

Labels: audience/technical, kind/cicd, kind/performance, kind/validation, triage/help wanted, triage/high-priority

david-thrower commented Apr 4, 2025

Complete this ...

The code thus far:


# Hard set these 2 params and do separate studies on each
# (kept here for reference; see the note in the objective signature below)
MINIMUM_LEVELS = 2
MAXIMUM_LEVELS = 2
NAMESPACE = "kubeflow"
JOB_NAME = "NLPtrainTask0001"


def objective(
        parameters,

        # Hard set these 2 params and do separate studies on each.
        # NOTE: the values are inlined as literals because katib_client.tune()
        # serializes only this function's source for the trial containers,
        # where module-level constants like MINIMUM_LEVELS would be undefined.
        minimum_levels=2,
        maximum_levels=2):

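    # All imports live inside the objective because katib_client.tune() ships
    # this function's source to the trial containers; anything imported at
    # module level would not be available there.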
    import tensorflow as tf
    import tensorflow_text
    from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor
    from keras_nlp.layers import PositionEmbedding
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle
    from tensorflow.keras.utils import to_categorical
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import Input, Flatten
    import pandas as pd
    import numpy as np
    from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search\
            import SimpleCerebrosRandomSearch
    import pendulum
    from cerebros.units.units import DenseUnit
    from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
            import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
    from ast import literal_eval
    import time
    
    embedding_dim = parameters["embedding_dim"]
    activation = parameters["activation"]
    predecessor_level_connection_affinity_factor_first = \
        parameters["predecessor_level_connection_affinity_factor_first"]
    predecessor_level_connection_affinity_factor_main = \
        parameters["predecessor_level_connection_affinity_factor_main"]
    max_consecutive_lateral_connections = \
        parameters["max_consecutive_lateral_connections"]
    p_lateral_connection = parameters["p_lateral_connection"]
    num_lateral_connection_tries_per_unit = \
        parameters["num_lateral_connection_tries_per_unit"]
    learning_rate = parameters["learning_rate"]
    epochs = parameters["epochs"]
    batch_size = parameters["batch_size"]
    # TODO: add a second dropout parameter so dropout can be applied within
    # the Cerebros blocks, separate from the beachhead dropout layer.
    dropout = parameters["dropout"]
    maximum_units_per_level = parameters["maximum_units_per_level"]
    maximum_neurons_per_unit = parameters["maximum_neurons_per_unit"]


    df = pd.read_csv("Phishing_Email.csv")
    df = df[df['Email Text'].apply(lambda x: isinstance(x, str))]
    df.reset_index(drop=True, inplace=True)
    label_mapping = {"Safe Email": 0, "Phishing Email": 1}
    df["Binary Label"] = df["Email Type"].map(label_mapping)
    X = df["Email Text"].to_numpy()
    y = df["Binary Label"].to_numpy()
    X, y = shuffle(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.85, shuffle=False)
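    # NOTE: test_size=0.85 keeps only 15% of the rows for training; presumably
    # intentional here to keep each Katib trial fast.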

    baseline_train_x = tf.constant(X_train)
    baseline_train_y = tf.constant(y_train, dtype=tf.int8)

    training_x  = [baseline_train_x]
    train_labels = [baseline_train_y]

    INPUT_SHAPES  = [()]
    OUTPUT_SHAPES = [1]
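    # Each sample is a single raw string, hence the scalar (empty) input shape;
    # the lone sigmoid output corresponds to an output shape of 1.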

    # -------------------------------- GPT2 Model Definition --------------------------------
    class TokenizerLayer(tf.keras.layers.Layer):  # TokenizerLayer definition (as before)
        def __init__(self, max_seq_length, **kwargs):
            super(TokenizerLayer, self).__init__(**kwargs)
            self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")
            self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
            self.max_seq_length = max_seq_length

        def call(self, inputs):
            prep = self.preprocessor([inputs])
            return prep['token_ids']

        def get_config(self):
            config = super(TokenizerLayer, self).get_config()
            config.update({'max_seq_length': self.max_seq_length})
            return config

        @classmethod
        def from_config(cls, config):
            return cls(max_seq_length=config['max_seq_length'])

    max_seq_length = 1024

    inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
    gpt2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
    VOCABULARY_SIZE = gpt2_tokenizer.tokenizer.vocabulary_size()
    tokens = gpt2_tokenizer(inp)

    embedded = tf.keras.layers.Embedding(
        input_dim=VOCABULARY_SIZE,
        output_dim=embedding_dim,
        input_length=max_seq_length,
        mask_zero=True)(tokens)
    position_embedding = PositionEmbedding(
        sequence_length=max_seq_length,
        initializer="uniform")(embedded)

    x = tf.keras.layers.Concatenate()([
             embedded,
             position_embedding])
    x = tf.keras.layers.Dropout(dropout)(x)
    flattened = tf.keras.layers.Flatten()(x)

    cerebros_base_model = tf.keras.Model(inputs=inp, outputs=flattened)
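    # This base model maps raw email strings to a flat (token + position)
    # embedding; Cerebros then searches dense architectures on top of its
    # output.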

    # -------------------------------- Cerebros AutoML Search --------------------------------

    TIME = pendulum.now(tz='America/New_York').format('YYYY_MM_DD_HH_mm')
    PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
    meta_trial_number = 42  # irrelevant unless in distributed training

    cerebros_automl = SimpleCerebrosRandomSearch(
        unit_type=DenseUnit,
        input_shapes=INPUT_SHAPES,
        output_shapes=OUTPUT_SHAPES,
        training_data=training_x,
        labels=train_labels,
        validation_split=0.35,
        direction='maximize',
        metric_to_rank_by="val_binary_accuracy",
        minimum_levels=minimum_levels,
        maximum_levels=maximum_levels,
        minimum_units_per_level=4,
        maximum_units_per_level=maximum_units_per_level,
        minimum_neurons_per_unit=1,
        maximum_neurons_per_unit=maximum_neurons_per_unit,
        activation=activation,
        final_activation='sigmoid',
        number_of_architecture_moities_to_try=5,
        number_of_tries_per_architecture_moity=1,
        minimum_skip_connection_depth=1,
        maximum_skip_connection_depth=7,
        predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
        predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
        predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
        predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
        predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
        seed=8675309,
        max_consecutive_lateral_connections=max_consecutive_lateral_connections,
        gate_after_n_lateral_connections=3,
        gate_activation_function=simple_sigmoid,
        p_lateral_connection=p_lateral_connection,
        p_lateral_connection_decay=zero_95_exp_decay,
        num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
        learning_rate=learning_rate,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(),
                 tf.keras.metrics.Precision(),
                 tf.keras.metrics.Recall()],
        epochs=epochs,
        project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
        model_graphs='model_graphs',
        batch_size=batch_size,
        meta_trial_number=meta_trial_number,
        base_models=[cerebros_base_model],
        train_data_dtype=tf.string
    )

    # -------------------------------- Run Search and Report Metric --------------------------------
    cerebros_t0 = time.time()
    result = cerebros_automl.run_random_search()
    cerebros_t1 = time.time()
    cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
    models_tried = 5 * 1  # number_of_architecture_moities_to_try * number_of_tries_per_architecture_moity
    cerebros_time_per_model = cerebros_time_all_models_min / models_tried

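    # Katib's metrics collector parses stdout lines of the form
    # "<metric_name>=<value>"; this print is how the objective reports
    # val_binary_accuracy back to the experiment.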
    print(f"val_binary_accuracy={result}")


import kubeflow.katib as katib

# [2] Create hyperparameter search space.
parameters = {
    "embedding_dim": katib.search.int(min=10, max=50, step=1),
    "activation": katib.search.categorical(["relu", "gelu", "elu"]),
    "predecessor_level_connection_affinity_factor_first": katib.search.double(min=0.1, max=50, step=0.1),
    "predecessor_level_connection_affinity_factor_main": katib.search.double(min=0.1, max=50, step=0.1),
    "max_consecutive_lateral_connections": katib.search.int(min=1, max=50, step=1),
    "p_lateral_connection": katib.search.double(min=0.1, max=50, step=0.1),
    "num_lateral_connection_tries_per_unit": katib.search.int(min=1, max=50, step=1),
    "learning_rate": katib.search.double(min=10 ** -5, max=0.3, step=10 ** -5),
    "epochs": katib.search.int(min=1, max=25, step=1),
    "batch_size": katib.search.int(min=1, max=35, step=1),
    "dropout": katib.search.double(min=0.05, max=0.95, step=0.05),
    "maximum_units_per_level": katib.search.int(min=5, max=10, step=1),
    "maximum_neurons_per_unit": katib.search.int(min=1, max=9, step=1)
}
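# Optional local smoke test (a sketch, not part of the study as filed): call
# the objective once with fixed mid-range values to validate the pipeline
# before submitting to Katib. Disabled by default, since a single Cerebros
# search is slow. RUN_SMOKE_TEST is a hypothetical guard flag.
RUN_SMOKE_TEST = False
if RUN_SMOKE_TEST:
    objective({
        "embedding_dim": 16,
        "activation": "gelu",
        "predecessor_level_connection_affinity_factor_first": 2.0,
        "predecessor_level_connection_affinity_factor_main": 0.7,
        "max_consecutive_lateral_connections": 5,
        "p_lateral_connection": 1.0,
        "num_lateral_connection_tries_per_unit": 2,
        "learning_rate": 0.001,
        "epochs": 2,
        "batch_size": 16,
        "dropout": 0.2,
        "maximum_units_per_level": 7,
        "maximum_neurons_per_unit": 4,
    })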



# [3] Create Katib Experiment: MV-TPE with 7 random startup trials, up to 24
# candidates considered per suggestion, 2 concurrent trials, and up to 25
# trials in total (consider lowering to 15; 25 will take about 2 days to run).
katib_client = katib.KatibClient(namespace=NAMESPACE)


algorithm_config = {
    "algorithm_name": "multivariate-tpe",
    "algorithm_settings": [
        {"name": "n_startup_trials", "value": "7"},
        {"name": "n_ei_candidates", "value": "24"},
        {"name": "random_state", "value": "42"}
    ]
}
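# In the Optuna-backed TPE suggestion service behind "multivariate-tpe",
# n_startup_trials is the number of random trials sampled before TPE modeling
# begins, and n_ei_candidates is the number of candidates scored by expected
# improvement for each suggestion.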



katib_client.tune(
    name=JOB_NAME,
    objective=objective,
    parameters=parameters,
    objective_metric_name="val_binary_accuracy",
    algorithm_name=algorithm_config["algorithm_name"],
    # Pass only the settings list here; the full algorithm_config dict is not
    # a valid value for algorithm_settings.
    algorithm_settings=algorithm_config["algorithm_settings"],
    max_trial_count=25,
    parallel_trial_count=2,
    resources_per_trial={"cpu": "8", "memory": "24Gi"},
)

# [4] Wait until Katib Experiment is complete
katib_client.wait_for_experiment_condition(name=JOB_NAME)

# [5] Get the best hyperparameters.
print(katib_client.get_optimal_hyperparameters(JOB_NAME))
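
# [6] Optional cleanup sketch (an addition, assuming the JOB_NAME/NAMESPACE
# defaults above): record the winning configuration, then delete the finished
# experiment using standard KatibClient methods.
if katib_client.is_experiment_succeeded(name=JOB_NAME):
    best = katib_client.get_optimal_hyperparameters(JOB_NAME)
    print(f"Best configuration: {best}")
    # Remove the finished experiment once its results are recorded.
    katib_client.delete_experiment(name=JOB_NAME)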