I'm trying to write an md5 hash function in R
without calling any C-routines. While my code executes just fine, the output never matches tools::md5sum
(which does match examples provided in various online documents). I suspect a byte-order (or word-order) problem somewhere; as can be seen in the code provided below I have tried inserting some flipflops but without success.
I have checked for simple mismatches in the output, such as the correct bytes but in the wrong order, but no luck.
To run this function, you'll need the libraries Rmpfr
and bigBits
as well as the functions provided here (flip32
, flip16
, and bigRotate
).
# need these to run:
library(Rmpfr)
library(bigBits)
mymd5 <- function(msg){
if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
# a const to be used when truncating values > ffffffff
two32 <- as.bigz(2^(as.bigz(32)))
sidx <- c( 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22 , 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20 , 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23 , 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21 )
# K-values verified to be correct.
K <- as.bigz(2^32 * abs(sin(1:64)))
# IETF shows these values as-is, i.e. don't change Endian on these calculations
#ietf.org page says A is 01234567, ie lower bytes first.
adef <- '67452301'
bdef <- 'efcdab89'
cdef <- '98badcfe'
ddef <- '10325476'
# then optionally
adef <- flip32(adef)
bdef <- flip32(bdef)
cdef <- flip32(cdef)
ddef <- flip32(ddef)
a0 <- (base2base(adef,16,10))[[1]]
b0 <-(base2base(bdef,16,10))[[1]]
c0 <-(base2base(cdef,16,10))[[1]]
d0 <-(base2base(ddef,16,10))[[1]]
# append 0x80 and pad with 0x00 bytes so that the message length in bytes ≡ 56 (mod 64).
origlenbit <- length(msg) * 8
msg <- c(msg, as.raw('0x80') )
bytemod <- length(msg)%%64
raw00 <- as.raw('0x00')
# append '0x00' 56-bytemod times
if (bytemod < 56 && bytemod > 0 ) {
msg <- c(msg,rep(raw00,times=(56-bytemod)))
} else if (bytemod > 56) {
msg <- c(msg, rep(raw00, times = bytemod-56))
}
# https://datatracker.ietf.org/doc/html/rfc1321#section-3.4 says
# A 64-bit representation of b (the length of the message before the
# padding bits were added) is appended to the result of the previous
# step. In the unlikely event that b is greater than 2^64, then only
# the low-order 64 bits of b are used. (These bits are appended as two
# 32-bit words and appended low-order word first in accordance with the
# previous conventions.)
len16 <- base2base(origlenbit,10,16)[[1]]
lenbit2 <- base2base(origlenbit,10,2)[[1]]
if(nchar(lenbit2) < 64 ){
lenbit2 <- paste0(c(rep('0',times = 64 -nchar(lenbit2)) ,lenbit2), collapse='')
} else if (nchar(lenbit2) > 64) lenbit2 <- rev(rev(lenbit2)[1:64])
newbytes2 <- substring(lenbit2, seq(1,nchar(lenbit2)-7,by=8), last = seq(8, nchar(lenbit2),by = 8) )
newdec <- base2base(newbytes2,2,10)
newdbl <- rep(0,times= 8)
for (jd in 1: 8) newdbl[jd] <- as.numeric(newdec[[jd]])
newraw <- as.raw(newdbl)
# which order is correct?
# msg <- c(msg,newraw[c(7,8,5,6,3,4,1,2)])
msg <- c(msg, newraw[5:8], newraw[1:4])
# msg <- c(msg, newraw[1:4], newraw[5:8])
chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
## turn groups of 512 BITS of msg into 16 words of 32 bits each.
for(jchunk in 1: (length(msg)/64)) {
thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
# should I reverse the bytes of this chunk?
thisM <- as.bigz(rep(0,16) )
for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1) )
# for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(1,256,256^2,256^3) )
# Initialize hash value for this chunk:
Ah<- a0
Bh<- b0
Ch<- c0
Dh<- d0
#now the chunk loop, which I make into 4 sep'te loops
for (jone in 1:16) {
Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
g <- jone
Fh<- (Fh + Ah + K[jone] + thisM[g]) %%two32 # truncate
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[jone], inTwo=FALSE)[[1]] )%%two32
}
for (j2 in 17:32) {
Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]]
g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
Fh<- (Fh + Ah + K[j2] + thisM[g])%%two32
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[j2], inTwo=FALSE)[[1]])%%two32
}
for(j3 in 33:48){
Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
g <-(5 + 3*(j3-1))%%16 +1
Fh<- (Fh + Ah + K[j3] + thisM[g])%%two32
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[j3],inTwo=FALSE)[[1]])%%two32
}
for(j4 in 49:64){
Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
g <- (7 * (j4 -1))%%16 +1
Fh<- (Fh + Ah + K[j4] + thisM[g])%%two32
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[j4], inTwo=FALSE)[[1]])%%two32
}
# Add this chunk's hash to result so far:
a0<- (a0 + Ah)%%two32
b0<- (b0 + Bh)%%two32
c0<- (c0 + Ch)%%two32
d0<- (d0 + Dh)%%two32
} # end of jchunk
#finally, string the values together.
# watch for those leading zeros
a0x <-(base2base(a0,10,16))
a0x <- paste0(c(rep(0,times=max(8-(nchar(a0x)),0)), a0x), collapse='')
b0x <-(base2base(b0,10,16))
b0x <- paste0(c(rep(0,times=max(8-(nchar(b0x)),0)), b0x), collapse='')
c0x <-(base2base(c0,10,16))
c0x <- paste0(c(rep(0,times=max(8-(nchar(c0x)),0)), c0x), collapse='')
d0x <-(base2base(d0,10,16))
d0x <- paste0(c(rep(0,times=max(8-(nchar(d0x)),0)), d0x), collapse='')
thesum <- paste0(c(a0x,b0x,c0x,d0x) , sep='',collapse='')
return(thesum)
}
# helper func to test reordering bytes. e.g. 01234567 from 67452301
flip32 <- function(x){
xsep <- unlist(strsplit(x,'') )
xout <- paste0(c(xsep[c(7,8,5,6,3,4,1,2)]),collapse='')
return(xout)
}
flip16 <- function(x) {
xsep <- unlist(strsplit(x,''))
xout <- paste0(c(xsep[c(3,4,1,2)]),collapse='')
return(xout)
}
bigRotate <- function(x, shift, inBase = 10,binSize = 32, outBase = 10, inTwosComp = TRUE) {
#default binary size is 32 to match bitwShiftR
shift = floor(shift[1])
out <- rep('0',times = length(x)) # gmp::as.bigz(rep(0,times=length(x)))
for(jj in 1:length(x)) {
bintmp <- base2base(x[[jj]],inBase,2, binSize = binSize, inTwosComp = inTwosComp, outTwosComp = FALSE)[[1]]
#now all inputs will have a neg sign if negative.
isPos = TRUE
if(length(grep('-',bintmp)) ) {
isPos = FALSE
bintmp <- gsub('^-','',bintmp)
}
bintmp <- strsplit(bintmp,'')[[1]]
xlen = length(bintmp)
# now rotate the bits.
otmp <- unlist(paste0(c(bintmp[ ((1:xlen) + shift -1) %%(xlen) + 1]),collapse=''))
out[jj] <- base2base(otmp, 2, outBase, binSize=binSize, outTwosComp = FALSE, classOut = "character")[[1]]
if(!isPos) out[jj] <- paste0('-',out[jj], collapse='')
}
# provide output in source 'inBase'
if(is(x,'mpfr') || is(x,'mpfr1')) {
out <- Rmpfr::.bigz2mpfr(gmp::as.bigz(out))
} else if(is(x,'numeric')) {
out <- as.numeric(out)
} else if(is(x,'bigz')) out <- gmp::as.bigz(out)
return(out)
}