# Step 1: Setup
!pip install transformers datasets
!pip install accelerate  # Trainer requires accelerate in recent transformers releases
# Step 2: Mount Google Drive to access your data
from google.colab import drive
drive.mount('/content/drive')
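# Optional sanity check (a sketch, reusing the placeholder path from Step 3):
# confirm the mount worked and the training file is actually visible.
import os
data_path = '/content/drive/My Drive/path_to_your_text_file.txt'
assert os.path.exists(data_path), f"File not found: {data_path}"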
# Step 3: Prepare Your Data
from datasets import load_dataset
# Replace 'path_to_your_text_file.txt' with the actual path of your text file in Google Drive
dataset = load_dataset('text', data_files={'train': '/content/drive/My Drive/path_to_your_text_file.txt'})
# Hold out 10% of the lines for validation; train_test_split is a Dataset method,
# so call it on the 'train' split (it names the held-out portion "test")
dataset = dataset['train'].train_test_split(test_size=0.1)
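# Quick inspection (optional): each line of the text file becomes one example
print(dataset)               # shows the "train" and "test" splits and their sizes
print(dataset["train"][0])   # e.g. {'text': '<first line of your file>'}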
# Step 4: Tokenization
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS for padding
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)
# Drop the raw "text" column so only token IDs reach the Trainer
dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
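# Optional: for long corpora it is common to concatenate all token IDs and cut
# them into fixed-length blocks so nothing is lost to truncation. A minimal
# sketch; block_size = 128 is an illustrative choice, not a library default.
block_size = 128
def group_texts(examples):
    # Concatenate each field (input_ids, attention_mask), then slice into blocks
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    return {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
# dataset = dataset.map(group_texts, batched=True)  # uncomment to enable chunking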
# Step 5: Model Preparation
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained('gpt2')
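# Optional check (plain PyTorch, no extra assumptions): the Trainer will use a
# GPU automatically if Colab provides one, but it helps to confirm before training.
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Parameters: {model.num_parameters():,}")  # base gpt2 is ~124M parameters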
# Step 6: Training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
# Causal-LM collator: pads each batch and copies input_ids into labels (mlm=False),
# which is what lets the Trainer compute a loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
    save_steps=10_000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],  # train_test_split names the held-out split "test"
)
trainer.train()
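# Persist the fine-tuned weights so they survive the Colab session; the folder
# name 'my-gpt2-finetuned' is just an example, save anywhere on your Drive.
save_dir = '/content/drive/My Drive/my-gpt2-finetuned'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)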
# Step 7: Testing
from transformers import pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
result = generator('My custom model says,')[0]
print(result['generated_text'])
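# The pipeline's default generation settings are conservative; these are the
# standard sampling knobs (all regular generate() kwargs), shown as a sketch:
result = generator(
    'My custom model says,',
    max_new_tokens=50,   # length of the continuation
    do_sample=True,      # sample instead of greedy decoding
    temperature=0.8,     # <1.0 sharpens the next-token distribution
    top_p=0.95,          # nucleus sampling
)[0]
print(result['generated_text'])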