import os

os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['HF_HOME'] = '/home/ht/huggingface/models'

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset

dataset_name = "imdb"
task = "sentiment-analysis"

dataset = load_dataset(dataset_name).shuffle()

# Split into train and test sets (use a subset of the data to speed up training)
train_dataset = dataset["train"].select(range(20000))
test_dataset = dataset["test"].select(range(5000))

model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# Apply preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1},
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define the evaluation metric
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate the fine-tuned model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_bert_imdb")
tokenizer.save_pretrained("./fine_tuned_bert_imdb")

from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./fine_tuned_bert_imdb",
    tokenizer="./fine_tuned_bert_imdb",
)

# Example prediction
print(classifier("This movie was fantastic! I loved every minute of it."))

# data = dataset["train"]["text"][:10]
# inputs = tokenizer(data, padding=True, truncation=True, return_tensors="pt")
# outputs = model(**inputs)
# predictions = outputs.logits.argmax(dim=-1)
# labels = dataset["train"]["label"][:10]
# for i, (prediction, label) in enumerate(zip(predictions, labels)):
#     prediction_label = "positive" if prediction == 1 else "negative"
#     true_label = "positive" if label == 1 else "negative"
#     print(f"Example {i+1}: Prediction: {prediction_label}, True Label: {true_label}")
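
# Optional sketch: the commented-out manual loop above could also be done in one call
# with Trainer.predict, which returns logits and true labels for a whole tokenized split.
# This assumes `trainer`, `tokenized_test`, `model`, and `np` from the script above are
# still in scope; it is an illustrative variant, not part of the original training flow.
# pred_output = trainer.predict(tokenized_test)
# pred_labels = np.argmax(pred_output.predictions, axis=-1)
# for i, (pred, label) in enumerate(zip(pred_labels[:10], pred_output.label_ids[:10])):
#     pred_name = model.config.id2label[int(pred)]
#     true_name = model.config.id2label[int(label)]
#     print(f"Example {i+1}: Prediction: {pred_name}, True Label: {true_name}")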