# perplexity.py
  1. import nltk
  2. from nltk.lm.preprocessing import padded_everygram_pipeline
  3. from nltk.lm import MLE
  4. from nltk import FreqDist
  5. from utils import *
  6. from burstiness import *
  7. msg = ""
  8. def get_perplexity(text):
  9. train_sentences = [get_answer(text)]
  10. tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent)))
  11. for sent in train_sentences]
  12. n = 1
  13. train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
  14. model = MLE(n)
  15. model.fit(train_data, padded_vocab)
  16. test_sentences = [text]
  17. tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent)))
  18. for sent in test_sentences]
  19. test_data, _ = padded_everygram_pipeline(n, tokenized_text)
  20. for i, test in enumerate(test_data):
  21. n = float(model.perplexity(test))
  22. try:
  23. if n != float('inf'):
  24. score = n / get_burstiness(text)
  25. global msg
  26. if score < 51:
  27. msg = ("Your text is more likely to be generate by an AI since your score was: {0}".format(100 - (score / 2)))
  28. else:
  29. msg = ("Your text is more likely to be generate by a human since your score was: {0}".format((score / 2)))
  30. else:
  31. get_perplexity(text)
  32. except:
  33. get_perplexity(text)
  34. def return_msg():
  35. return msg