5

I'm trying to write an md5 hash function in R without calling any C-routines. While my code executes just fine, the output never matches tools::md5sum (which does match examples provided in various online documents). I suspect a byte-order (or word-order) problem somewhere; as can be seen in the code provided below I have tried inserting some flipflops but without success. I have checked for simple mismatches in the output, such as the correct bytes but in the wrong order, but no luck.

To run this function, you'll need the libraries Rmpfr and bigBits as well as the functions provided here (flip32, flip16, and bigRotate ).

# need these to run:
library(Rmpfr)
library(bigBits)

mymd5 <- function(msg){

  if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
# a const to be used when truncating values > ffffffff
two32 <- as.bigz(2^(as.bigz(32))) 

sidx <- c( 7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22 , 5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20 , 4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23 , 6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21 )

#  K-values verified to be correct.
 K <- as.bigz(2^32 * abs(sin(1:64))) 

# IETF  shows these values as-is, i.e. don't change Endian on these calculations
#ietf.org page says A is 01234567, ie lower bytes first.  

adef <- '67452301'
bdef <- 'efcdab89'
cdef <- '98badcfe'
ddef <- '10325476'

# then optionally
adef <- flip32(adef)
bdef <- flip32(bdef)
cdef <- flip32(cdef)
ddef <- flip32(ddef)

a0 <- (base2base(adef,16,10))[[1]]
b0 <-(base2base(bdef,16,10))[[1]]
c0 <-(base2base(cdef,16,10))[[1]]
d0 <-(base2base(ddef,16,10))[[1]]

#  append 0x80  and pad with 0x00 bytes so that the message length in bytes ≡ 56 (mod 64).
  origlenbit <- length(msg) * 8
  msg <- c(msg, as.raw('0x80') )
  bytemod <- length(msg)%%64
  raw00 <- as.raw('0x00')
#  append  '0x00' 56-bytemod times
    if (bytemod <  56 && bytemod > 0 ) {
        msg <- c(msg,rep(raw00,times=(56-bytemod)))
    } else if (bytemod > 56) {
        msg <- c(msg, rep(raw00, times = bytemod-56))
    }
    
 # https://datatracker.ietf.org/doc/html/rfc1321#section-3.4 says
 #  A 64-bit representation of b (the length of the message before the
   # padding bits were added) is appended to the result of the previous
   # step. In the unlikely event that b is greater than 2^64, then only
   # the low-order 64 bits of b are used. (These bits are appended as two
   # 32-bit words and appended low-order word first in accordance with the
   # previous conventions.)
    len16 <-    base2base(origlenbit,10,16)[[1]]
    lenbit2 <- base2base(origlenbit,10,2)[[1]]
    if(nchar(lenbit2) < 64 ){
        lenbit2 <- paste0(c(rep('0',times = 64 -nchar(lenbit2)) ,lenbit2), collapse='')
    } else if (nchar(lenbit2) > 64) lenbit2 <- rev(rev(lenbit2)[1:64])
    newbytes2 <- substring(lenbit2, seq(1,nchar(lenbit2)-7,by=8), last = seq(8, nchar(lenbit2),by = 8) )    
    newdec <- base2base(newbytes2,2,10)
    newdbl <- rep(0,times= 8)
    for (jd in 1: 8) newdbl[jd] <- as.numeric(newdec[[jd]])
    newraw <- as.raw(newdbl)
#  which order is correct? 
#   msg <- c(msg,newraw[c(7,8,5,6,3,4,1,2)])
    msg <- c(msg, newraw[5:8], newraw[1:4])
#   msg <- c(msg, newraw[1:4], newraw[5:8])
    chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
##  turn groups of 512 BITS of msg into 16 words of 32 bits each.
    for(jchunk in 1: (length(msg)/64)) {  
        thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
#  should I reverse the bytes of this chunk? 
        thisM <- as.bigz(rep(0,16) )
        for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1) )
#       for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(1,256,256^2,256^3) )
      # Initialize hash value for this chunk:
         Ah<- a0
         Bh<- b0
         Ch<- c0
         Dh<- d0
#now the chunk loop, which I make into 4 sep'te loops 
        for (jone in 1:16) {
            Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
            g <- jone
            Fh<- (Fh + Ah + K[jone] + thisM[g]) %%two32  # truncate
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[jone], inTwo=FALSE)[[1]] )%%two32   
        }

        for (j2 in 17:32) {
            Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]] 
            g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
            Fh<- (Fh + Ah + K[j2] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j2], inTwo=FALSE)[[1]])%%two32   
        }
        for(j3 in 33:48){
            Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
            g <-(5 + 3*(j3-1))%%16 +1
            Fh<- (Fh + Ah + K[j3] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j3],inTwo=FALSE)[[1]])%%two32 
        }
        for(j4 in 49:64){
            Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
            g <- (7 * (j4 -1))%%16 +1
            Fh<- (Fh + Ah + K[j4] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j4], inTwo=FALSE)[[1]])%%two32 
        }
           # Add this chunk's hash to result so far:
        a0<- (a0 + Ah)%%two32
        b0<- (b0 + Bh)%%two32
        c0<- (c0 + Ch)%%two32
        d0<- (d0 + Dh)%%two32
    }   # end of jchunk

#finally, string the values together.
# watch for those leading zeros
a0x <-(base2base(a0,10,16))
a0x <- paste0(c(rep(0,times=max(8-(nchar(a0x)),0)), a0x), collapse='')
b0x <-(base2base(b0,10,16))
b0x <- paste0(c(rep(0,times=max(8-(nchar(b0x)),0)), b0x), collapse='')
c0x <-(base2base(c0,10,16))
c0x <- paste0(c(rep(0,times=max(8-(nchar(c0x)),0)), c0x), collapse='')
d0x <-(base2base(d0,10,16))
d0x <- paste0(c(rep(0,times=max(8-(nchar(d0x)),0)), d0x), collapse='')


thesum <- paste0(c(a0x,b0x,c0x,d0x) , sep='',collapse='')
return(thesum)
} 

# helper func to test reordering bytes. e.g.  01234567 from 67452301

flip32 <- function(x){
    xsep <- unlist(strsplit(x,'') )
    xout <- paste0(c(xsep[c(7,8,5,6,3,4,1,2)]),collapse='')
    return(xout)
}
flip16 <- function(x) {
        xsep <- unlist(strsplit(x,''))
    xout <- paste0(c(xsep[c(3,4,1,2)]),collapse='')
    return(xout)
}


bigRotate <- function(x, shift,  inBase = 10,binSize = 32, outBase = 10, inTwosComp = TRUE) {
#default binary size is 32 to match bitwShiftR  
 shift = floor(shift[1])

 out <- rep('0',times = length(x))  # gmp::as.bigz(rep(0,times=length(x)))
    for(jj in 1:length(x)) {
        bintmp <- base2base(x[[jj]],inBase,2, binSize = binSize, inTwosComp = inTwosComp, outTwosComp = FALSE)[[1]]
#now all inputs will have a neg sign if negative.
        isPos = TRUE
        if(length(grep('-',bintmp)) ) {
            isPos = FALSE
            bintmp <- gsub('^-','',bintmp)
        }
        bintmp <- strsplit(bintmp,'')[[1]]
        xlen = length(bintmp)
    # now rotate the bits.  
        otmp  <- unlist(paste0(c(bintmp[ ((1:xlen) + shift -1) %%(xlen) + 1]),collapse=''))
        out[jj] <- base2base(otmp, 2, outBase, binSize=binSize, outTwosComp = FALSE, classOut = "character")[[1]]
        if(!isPos) out[jj] <- paste0('-',out[jj], collapse='')
    }
    # provide output in source 'inBase'
    if(is(x,'mpfr') || is(x,'mpfr1')) {
        out <- Rmpfr::.bigz2mpfr(gmp::as.bigz(out))
    } else if(is(x,'numeric')) {
        out <- as.numeric(out)
    } else if(is(x,'bigz')) out <- gmp::as.bigz(out)
    
 return(out) 
}
Carl Witthoft
  • 20,573
  • 9
  • 43
  • 73

1 Answers1

5

There are a few issues which are listed below.

1. Removed flip32

2. Issue with appending the padding bits

It has been rewritten as

msg <- c(msg, as.raw('0x80') )
while (length(msg) %% 64 != 56) {
  msg <- c(msg, as.raw(0))
}

3. The rows of the thischunk matrix needed to be reversed.

thischunk <- thischunk[nrow(thischunk):1,]

4. Issue with bigRotate

This can be expressed as ((x<<amount) | (x>>(32-amount))) & 0xFFFFFFFF so it has been rewritten as

bigRotate <- function(x, amount) {
  two32 <- as.bigz(2^(as.bigz(32)))
  x <- x %% two32
  lshift <- bigShiftL(x, amount)
  rshift <- bigShiftR(x, 32-amount)
  
  # bigOr errors if either value is zero so handle those cases explicitly
  if (lshift == 0) {
    rshift
  } else if (rshift == 0) {
    lshift
  } else {
    bigOr(lshift, rshift) %% two32
  }
}

5. Combining a, b, c, and d was rewritten and the endianness was swapped.

swap_endianness <- function(hex) {
  chars <- strsplit(hex, "")[[1]]
  pairs <- paste0(chars[c(TRUE, FALSE)], chars[c(FALSE, TRUE)])
  paste0(rev(pairs), collapse = "")
}

thesum <- sum(
    bigShiftL(a0, 32 * 0),
    bigShiftL(b0, 32 * 1),
    bigShiftL(c0, 32 * 2),
    bigShiftL(d0, 32 * 3)
)
hex <- base2base(thesum, frombase = 10, tobase = 16)[[1]]
output <- swap_endianness(hex)

Corrected code

Implementing the above changes yields

library(bigBits)
library(gmp)


mymd5 <- function(msg){
  if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
  
  # a const to be used when truncating values > ffffffff
  two32 <- as.bigz(2^(as.bigz(32)))
  sidx <- c(7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22 , 5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20 , 4, 11, 16, 23,  4, 11, 16, 23, 4, 11, 16, 23,  4, 11, 16, 23 , 6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21)
  K <- as.bigz(2^32 * abs(sin(1:64))) 
  
  adef <- '67452301'
  bdef <- 'efcdab89'
  cdef <- '98badcfe'
  ddef <- '10325476'
  
  a0 <- base2base(adef, 16, 10)[[1]]
  b0 <- base2base(bdef, 16, 10)[[1]]
  c0 <- base2base(cdef, 16, 10)[[1]]
  d0 <- base2base(ddef, 16, 10)[[1]]

  origlenbit <- length(msg) * 8
  
  msg <- c(msg, as.raw('0x80') )
  while (length(msg) %% 64 != 56) {
    msg <- c(msg, as.raw(0))
  }

  length_bytes <- writeBin(as.integer(origlenbit), raw(), size = 8, endian = "little")
  msg <- c(msg, length_bytes, rep(0, length.out = 8 - length(length_bytes)))
  
  chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
  
  ##  turn groups of 512 BITS of msg into 16 words of 32 bits each.
  for(jchunk in 1: (length(msg)/64)) {  
    thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
    thischunk <- thischunk[nrow(thischunk):1,]
    thisM <- as.bigz(rep(0,16) )
    for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1))

    # Initialize hash value for this chunk:
    Ah <- a0
    Bh <- b0
    Ch <- c0
    Dh <- d0
    
    #now the chunk loop, which I make into 4 sep'te loops 
    for (jone in 1:16) {
      Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
      g <- jone
      Fh <- (Fh + Ah + K[jone] + thisM[g]) %% two32
      Ah <- Dh
      Dh <- Ch
      Ch <- Bh
      Bh <- (Bh + bigRotate(Fh, sidx[jone])) %% two32   
    }
    
    for (j2 in 17:32) {
      Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]] 
      g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
      Fh <- (Fh + Ah + K[j2] + thisM[g]) %% two32  
      Ah <- Dh
      Dh <- Ch
      Ch <- Bh
      Bh <- (Bh + bigRotate(Fh, sidx[j2])) %% two32
    }
    for(j3 in 33:48){
      Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
      g <-(5 + 3*(j3-1))%%16 +1
      Fh<- (Fh + Ah + K[j3] + thisM[g]) %% two32  
      Ah<- Dh
      Dh<- Ch
      Ch<- Bh
      Bh<- (Bh + bigRotate(Fh, sidx[j3]))%%two32 
    }
    for(j4 in 49:64){
      Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
      g <- (7 * (j4 -1))%%16 +1
      Fh<- (Fh + Ah + K[j4] + thisM[g]) %% two32  
      Ah<- Dh
      Dh<- Ch
      Ch<- Bh
      Bh<- (Bh + bigRotate(Fh, sidx[j4])) %% two32 
    }
    
    # Add this chunk's hash to result so far:
    a0 <- (a0 + Ah) %% two32
    b0 <- (b0 + Bh) %% two32
    c0 <- (c0 + Ch) %% two32
    d0 <- (d0 + Dh) %% two32
  }   # end of jchunk

  thesum <- sum(
    bigShiftL(a0, 32 * 0),
    bigShiftL(b0, 32 * 1),
    bigShiftL(c0, 32 * 2),
    bigShiftL(d0, 32 * 3)
  )
  
  hex <- base2base(thesum, frombase = 10, tobase = 16)[[1]]
  swap_endianness(hex)
}


# ((x<<amount) | (x>>(32-amount))) & 0xFFFFFFFF
bigRotate <- function(x, amount) {
  two32 <- as.bigz(2^(as.bigz(32)))
  x <- x %% two32
  lshift <- bigShiftL(x, amount)
  rshift <- bigShiftR(x, 32-amount)
  
  # bigOr errors if either value is zero so handle those cases explicitly
  if (lshift == 0) {
    rshift
  } else if (rshift == 0) {
    lshift
  } else {
    bigOr(lshift, rshift) %% two32
  }
}


swap_endianness <- function(hex) {
  chars <- strsplit(hex, "")[[1]]
  pairs <- paste0(chars[c(TRUE, FALSE)], chars[c(FALSE, TRUE)])
  paste0(rev(pairs), collapse = "")
}

Which passes a handful of unit tests

library(testthat)

test_that("md5 works", {
  expect_equal(mymd5(""), "d41d8cd98f00b204e9800998ecf8427e")
  expect_equal(mymd5("a"), "0cc175b9c0f1b6a831c399e269772661")
  expect_equal(mymd5("abc"), "900150983cd24fb0d6963f7d28e17f72")
  expect_equal(mymd5("message digest"), "f96b697d7cb7938d525a2f31aaf161d0")
  expect_equal(mymd5("abcdefghijklmnopqrstuvwxyz"), "c3fcd3d76192e4007dfb496cca67e13b")
  expect_equal(mymd5("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), "d174ab98d277d9f5a5611c2c9f419d9f")
  expect_equal(mymd5("12345678901234567890123456789012345678901234567890123456789012345678901234567890"), "57edf4a22be3c955ac49da2e2107b67a")
})
#> Test passed
Paul
  • 8,734
  • 1
  • 26
  • 36
  • 1
    Wow! thanks! I had caught a couple of those (like reversing the bytes in `thischunk`, but your amazing attention to detail is impressive. I have to review your streamlined `bigRotate` because I want that function to be generic, i.e. works for arbitrary-sized values, not just 32-bit. – Carl Witthoft Jul 13 '23 at 14:20
  • A quick followup note for those "scraping code" -- this `bigRotate`, when modified to allow for arbitrary size binaries, runs far slower than the code I provided in the question. They both produce the same, correct, output. – Carl Witthoft Jul 24 '23 at 13:45