import datasets
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
import evaluate
import random
import argparse
from nltk.corpus import wordnet
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Fixed seed so the random transformations below are reproducible across runs.
random.seed(0)


def example_transform(example):
    """Lower-case the ``"text"`` field of ``example`` in place and return it."""
    example["text"] = example["text"].lower()
    return example


### Rough guidelines --- typos
# For typos, you can try to simulate nearest keys on the QWERTY keyboard for some of the letters (e.g. vowels).
# You can randomly select each word with some fixed probability, and replace random letters in that word with one of the
# nearest keys on the keyboard. You can vary the random probability or which letters to use to achieve the desired accuracy.


### Rough guidelines --- synonym replacement
# For synonyms, you can rely on wordnet (already imported here). Wordnet (https://www.nltk.org/howto/wordnet.html) includes
# something called synsets (which stands for synonymous words) and for each of them, lemmas() should give you a possible synonym word.
# You can randomly select each word with some fixed probability to replace by a synonym.


def custom_transform(example):
    """Randomly replace words of ``example["text"]`` with WordNet synonyms.

    The text is split on whitespace. Each word is selected with probability
    ``p_sub``; a selected word that has at least one WordNet synset is
    replaced by a lemma of one randomly chosen synset. With probability
    ``1 - p_sub`` the synset's first lemma is used, otherwise a random lemma
    of that synset is picked. Words are re-joined with single spaces, so the
    original whitespace layout is not preserved.

    Args:
        example: Mapping with a string under the ``"text"`` key.

    Returns:
        The same ``example`` object, with ``example["text"]`` overwritten.
    """
    ################################
    ##### YOUR CODE BEGINS HERE ####

    # Design and implement the transformation as mentioned in pdf
    # You are free to implement any transformation but the comments at the top roughly describe
    # how you could implement two of them --- synonym replacement and typos.

    # You should update example["text"] using your transformation
    p_sub = 0.6
    words = example["text"].split()

    for i, word in enumerate(words):
        # Leave roughly (1 - p_sub) of the words untouched.
        if random.random() > p_sub:
            continue

        synsets = wordnet.synsets(word)
        if not synsets:
            continue

        lemmas = random.choice(synsets).lemmas()
        if not lemmas:
            continue

        # A further p_sub of the time, simulate a grammatical error or add
        # more diversity by picking a random lemma instead of the first one.
        if random.random() > p_sub:
            lemma = lemmas[0]
        else:
            lemma = random.choice(lemmas)

        # WordNet joins multi-word lemma names with underscores
        # (e.g. "ice_cream"); turn them back into ordinary spaces so the
        # transformed text does not contain artificial underscore tokens.
        words[i] = lemma.name().replace("_", " ")

    example["text"] = " ".join(words)

    ##### YOUR CODE ENDS HERE ######

    return example