import tempfile

from sklearn.model_selection import train_test_split
from skopt import gp_minimize
from skopt.space import Real, Integer
from transformers import (GPT2LMHeadModel, GPT2Tokenizer, TextDataset,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)

def _write_split(texts):
    # TextDataset reads from a file on disk, so write each split to a temp file.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
        f.write('\n'.join(texts))
        return f.name
def fine_tune_gpt(data, model_name='gpt2'):
    # `data` is assumed to be a list of raw text strings; hold out 10% for validation.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_texts, val_texts = train_test_split(data, test_size=0.1, random_state=42)
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=_write_split(train_texts), block_size=128)
    val_dataset = TextDataset(tokenizer=tokenizer, file_path=_write_split(val_texts), block_size=128)
    # mlm=False gives causal (next-token) language modeling, which is how GPT-2 is trained.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    # BayesSearchCV only accepts scikit-learn estimators, which GPT2LMHeadModel is not,
    # so run skopt's Bayesian optimization (gp_minimize) over a Trainer-based objective.
    space = [Real(1e-5, 1e-3, prior='log-uniform', name='learning_rate'),
             Integer(1, 5, name='num_train_epochs'),
             Integer(4, 16, name='per_device_train_batch_size')]

    def objective(params):
        lr, n_epochs, batch_size = params
        model = GPT2LMHeadModel.from_pretrained(model_name)  # fresh weights each trial
        args = TrainingArguments(output_dir='gpt2-search', learning_rate=float(lr),
                                 num_train_epochs=int(n_epochs),
                                 per_device_train_batch_size=int(batch_size))
        trainer = Trainer(model=model, args=args, data_collator=data_collator,
                          train_dataset=train_dataset, eval_dataset=val_dataset)
        trainer.train()
        return trainer.evaluate()['eval_loss']  # minimize held-out validation loss

    best = gp_minimize(objective, space, n_calls=10, random_state=42)
    return dict(zip([d.name for d in space], best.x))
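
# Example usage -- a sketch, not tied to any particular dataset. 'corpus.txt' is a
# hypothetical file with one training document per line; swap in your own corpus.
if __name__ == '__main__':
    with open('corpus.txt') as f:
        texts = [line.strip() for line in f if line.strip()]
    best_params = fine_tune_gpt(texts)
    print('Best hyperparameters:', best_params)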