OpenAI has recently announced a new AI classifier for indicating AI-written text, which is available as a tool here. According to the Model Card, this classifier assigns a text to one of five classes, as follows:
- "Very unlikely to be AI-generated" corresponds to a classifier threshold of <0.1.
- "Unlikely to be AI-generated" corresponds to a classifier threshold between 0.1 and 0.45.
- "Unclear if it is AI written" corresponds to a classifier threshold between 0.45 and 0.9.
- "Possibly AI-generated" corresponds to a classifier threshold between 0.9 and 0.98.
- "Likely AI-generated" corresponds to a classifier threshold >0.98.
But when I try it, I get in the output the logprobs of specific tokens that seem to map to the classes above. This is an example of the log probs in the case of AI-generated text:
{
"top_logprobs": [
{
"!": -3.4727178,
"\"": -0.033285,
" \"": -8.678962,
"\".": -10.159605,
"!\"": -8.384942
}
]
}
while this is an example of log probs in the case of human-generated text:
{
"top_logprobs": [
{
"!": -0.0272431,
"\"": -3.7918115,
"!\n\n": -8.135782,
"!!": -8.312574,
"”": -8.102049
}
]
}
To read these more easily, we can convert the log probs into probabilities like this:
// Take the first entry of top_logprobs from the first choice of the response.
log_probs = response.choices[0].logprobs.top_logprobs[0];
// Turn each token's log probability into a probability (exp), most probable first.
probs = Object.entries(log_probs)
  .map(([token, logProb]) => ({ label: token, log_prob: logProb, prob: Math.exp(logProb) }))
  .sort((left, right) => right.prob - left.prob);
This gives, for the AI-generated text:
[
{
"label": "\"",
"log_prob": -0.033285,
"prob": 0.9672628503899762
},
{
"label": "!",
"log_prob": -3.4727178,
"prob": 0.03103257561308009
},
{
"label": "!\"",
"log_prob": -8.384942,
"prob": 0.0002282789983770749
},
{
"label": " \"",
"log_prob": -8.678962,
"prob": 0.00017012756839526099
},
{
"label": "\".",
"log_prob": -10.159605,
"prob": 0.00003870255265601741
}
]
and for Human-generated text:
[
{
"label": "!",
"log_prob": -0.0272431,
"prob": 0.9731246461658533
},
{
"label": "\"",
"log_prob": -3.7918115,
"prob": 0.022554706970090203
},
{
"label": "”",
"log_prob": -8.102049,
"prob": 0.00030291782313947616
},
{
"label": "!\n\n",
"log_prob": -8.135782,
"prob": 0.000292869921924322
},
{
"label": "!!",
"log_prob": -8.312574,
"prob": 0.00024541154037504843
}
]
So it seems that:
- the token `"!"` is related to the classes "Unlikely to be AI-generated" (>0.1 and <0.45) and "Very unlikely to be AI-generated" (<0.1);
- the token `"\""` seems to be related to "Likely AI-generated" (>0.98) and "Possibly AI-generated" (>0.9 and <0.98).

For all other probability values (>0.45 and <0.9) the label would be "Unclear if it is AI written".
So, to get the label and its probability, I wrote this function, which filters out all tokens except `"!"` and `"\""`, converts the log probs into probabilities, sorts them, and assigns the label description:
/**
 * Convert one `top_logprobs` entry of the classifier into labelled probabilities.
 *
 * Only the tokens "!" and "\"" are kept. The working assumption (to be
 * confirmed against the classifier's behaviour) is that "!" is emitted for
 * human-written text and "\"" for AI-generated text. The most probable of the
 * two is treated as the prediction and receives a `desc` field holding one of
 * the five class names from the Model Card; the other entry is returned
 * without `desc`.
 *
 * The score compared with the Model Card thresholds is `prob` when "\"" is
 * predicted and `1 - prob` when "!" is predicted. The intervals cover the whole
 * range, so a predicted entry with a numeric probability always gets a label:
 *   score < 0.1           -> Very unlikely to be AI-generated
 *   0.1  <= score < 0.45  -> Unlikely to be AI-generated
 *   0.45 <= score <= 0.9  -> Unclear if it is AI written
 *   0.9  <  score <= 0.98 -> Possibly AI-generated
 *   score > 0.98          -> Likely AI-generated
 *
 * @param {Object<string, number>} log_probs - Map of token to natural-log probability.
 * @returns {Array<{label: string, log_prob: number, prob: number, desc?: string}>}
 *   Entries for "!" and/or "\"", sorted by descending probability; an empty
 *   array when `log_probs` is null/undefined or contains neither token.
 */
const getProbabilities = function(log_probs) {
  const HUMAN_TOKEN = "!";
  const AI_TOKEN = "\"";

  // Maps an AI-likelihood score to its class name; returns undefined for NaN
  // so that a non-numeric log prob never gets a misleading label.
  const describeAiScore = (score) => {
    if (score < 0.1) return 'Very unlikely to be AI-generated';
    if (score < 0.45) return 'Unlikely to be AI-generated';
    if (score <= 0.9) return 'Unclear if it is AI written';
    if (score <= 0.98) return 'Possibly AI-generated';
    if (score > 0.98) return 'Likely AI-generated';
    return undefined;
  };

  if (log_probs == null) {
    return [];
  }

  const probs = Object.keys(log_probs)
    .filter((token) => token === HUMAN_TOKEN || token === AI_TOKEN)
    .map((token) => ({
      label: token,
      log_prob: log_probs[token],
      prob: Math.exp(log_probs[token]),
    }))
    .sort((a, b) => b.prob - a.prob);

  // Neither marker token is among the returned top log probs: nothing to label.
  if (probs.length === 0) {
    return probs;
  }

  const predicted = probs[0];
  // For the human token the AI-likelihood is the complement of its probability.
  const aiScore = predicted.label === AI_TOKEN ? predicted.prob : 1 - predicted.prob;
  const desc = describeAiScore(aiScore);
  if (desc !== undefined) {
    predicted.desc = desc;
  }
  return probs;
};//getProbabilities
so I get
// Take the first top_logprobs entry of the first choice in `res` and label it
// with getProbabilities (the output that follows matches the AI-generated sample).
log_probs = res.choices[0].logprobs.top_logprobs[0];
probs = getProbabilities(log_probs);
[
{
"label": "\"",
"log_prob": -0.033285,
"prob": 0.9672628503899762,
"desc": "Possibly AI-generated"
},
{
"label": "!",
"log_prob": -3.4727178,
"prob": 0.03103257561308009
}
]
and
// Take the first top_logprobs entry of the first choice in `res` and label it
// with getProbabilities (the output that follows matches the human-generated sample).
log_probs = res.choices[0].logprobs.top_logprobs[0];
probs = getProbabilities(log_probs);
[
{
"label": "!",
"log_prob": -0.0272431,
"prob": 0.9731246461658533,
"desc": "Very unlikely to be AI-generated"
},
{
"label": "\"",
"log_prob": -3.7918115,
"prob": 0.022554706970090203
}
]
I'm not sure whether the assumptions I made in `getProbabilities` are correct. Is there a better way?