# --- Environment setup (Colab/IPython: lines starting with `!` run in the shell) ---
!pip install kaggle
import os
# Make a directory for Kaggle API Token
os.makedirs('/root/.kaggle', exist_ok=True)
# Copy the kaggle.json to the folder expected by the Kaggle API client
# (assumes the token was uploaded to /content/kaggle.json beforehand)
!cp /content/kaggle.json /root/.kaggle/
# Secure the file
# NOTE(review): `~` resolves to /root in Colab, so this matches the cp target
# above; the Kaggle client complains about world-readable credentials.
!chmod 600 ~/.kaggle/kaggle.json
# Download the dataset archive into /content ...
!kaggle datasets download -d gpreda/star-trek-scripts -p /content
# ... and extract it next to the archive.
!unzip /content/star-trek-scripts.zip -d /content/star-trek-scripts
import pandas as pd

# Load the extracted script lines into a DataFrame.
df = pd.read_csv('/content/star-trek-scripts/all_series_lines.csv')

# Sanity-check: peek at the first few rows.
print(df.head())
from transformers import DistilBertTokenizer

# Initialize the DistilBERT tokenizer (uncased vocabulary).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize only the 'line' column (dialogue text).
# Drop missing rows first: calling `.astype(str)` on a column containing NaN
# turns each NaN into the literal string "nan", silently polluting the data.
# NOTE(review): column name 'line' assumed from the original code — verify
# against the CSV header printed by df.head() above.
text_data = df['line'].dropna().astype(str).tolist()

max_length = 128  # padding/truncation length; tune for your downstream task

# Tokenize the whole list in one batch call; pads to the longest sequence
# (capped at max_length) and returns PyTorch tensors.
encoded_data = tokenizer(
    text_data,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='pt'
)
# `encoded_data` now contains input_ids, attention_mask, and other components
# required by transformers models.