import datasets
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
import evaluate
import random
import argparse
from nltk.corpus import wordnet
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Seed Python's global `random` generator with a fixed value so that the
# random choices made by the transformations below are repeatable.
random.seed(0)


def example_transform(example):
    """Lowercase ``example["text"]`` in place and return the same example."""
    lowered = example["text"].lower()
    example["text"] = lowered
    return example


### Rough guidelines --- typos
# For typos, you can try to simulate hitting a neighbouring key on the QWERTY keyboard for some of the letters (e.g. vowels).
# You can randomly select each word with some fixed probability, and replace random letters in that word with one of the
# nearest keys on the keyboard. You can vary the random probability or which letters to use to achieve the desired accuracy.


### Rough guidelines --- synonym replacement
# For synonyms, you can rely on WordNet (already imported here). WordNet (https://www.nltk.org/howto/wordnet.html) includes
# something called synsets (sets of synonymous words), and for each of them, lemmas() should give you the possible synonym words.
# You can randomly select each word with some fixed probability and replace it with a synonym.


def custom_transform(example):
    """Perturb ``example["text"]`` by randomly swapping words for WordNet synonyms.

    The text is split on whitespace, and each token is independently selected
    for replacement with probability ``p_sub``. For a selected token, one of
    its WordNet synsets is picked at random; the replacement is then either
    that synset's first lemma or, with probability ``p_sub``, a random lemma
    of the synset. Tokens for which WordNet returns no synsets are kept as-is.

    The tokens are re-joined with single spaces, so runs of whitespace in the
    original text are collapsed and leading/trailing whitespace is dropped.

    Args:
        example: Mapping with a string under the ``"text"`` key. It is
            modified in place.

    Returns:
        The same ``example`` object, with ``example["text"]`` replaced by the
        transformed text.
    """
    ################################
    ##### YOUR CODE BEGINGS HERE ###

    # Design and implement the transformation as mentioned in pdf
    # You are free to implement any transformation but the comments at the top roughly describe
    # how you could implement two of them --- synonym replacement and typos.

    # You should update example["text"] using your transformation

    # Probability of substituting a given word; also reused below as the
    # probability of picking a random (rather than the first) lemma.
    p_sub = 0.6
    words = example["text"].split()

    for i, word in enumerate(words):
        # Leave roughly (1 - p_sub) of the words untouched.
        if random.random() > p_sub:
            continue

        synsets = wordnet.synsets(word)
        if not synsets:
            continue
        lemmas = random.choice(synsets).lemmas()
        if not lemmas:
            continue

        # p_sub of the time, simulate a grammatical error or add more
        # diversity by selecting a random lemma instead of the first one.
        if random.random() > p_sub:
            lemma = lemmas[0]
        else:
            lemma = random.choice(lemmas)

        # WordNet joins multi-word lemma names with underscores
        # (e.g. "motion_picture"); turn them back into ordinary spaces so the
        # output stays natural text.
        words[i] = lemma.name().replace("_", " ")

    example["text"] = " ".join(words)
    ##### YOUR CODE ENDS HERE ######

    return example