Maybe something like this:
# Loop state, set one step "ahead" because the loop adjusts both values at the
# top of every pass:
#   $2 = bit index   (8 here, becomes 7 on the first pass)
#   $9 = bit mask    (256 = 1 << 8 here, becomes 0x80 on the first pass)
        addiu   $9, $0, 256
        addiu   $2, $0, 8

# Fetch all 16 input bytes up front, zero-extended, one per register:
# matrix[i] is held in register $(10 + i), i.e. $10 .. $25.
        lbu     $10, matrix
        lbu     $11, matrix+1
        lbu     $12, matrix+2
        lbu     $13, matrix+3
        lbu     $14, matrix+4
        lbu     $15, matrix+5
        lbu     $16, matrix+6
        lbu     $17, matrix+7
        lbu     $18, matrix+8
        lbu     $19, matrix+9
        lbu     $20, matrix+10
        lbu     $21, matrix+11
        lbu     $22, matrix+12
        lbu     $23, matrix+13
        lbu     $24, matrix+14
        lbu     $25, matrix+15
# Transpose loop: one pass per bit position, from bit 7 down to bit 0 (8 passes).
#
# Register roles inside the loop:
#   $2        current bit index (7..0); also the halfword index of the output row
#   $9        mask with only bit $2 set
#   $10..$25  the 16 input bytes (read-only here)
#   $26       scratch: the selected bit of one input byte, shifted down to bit 0
#   $27       output row being assembled; the bit taken from $10 ends up in
#             bit 15 and the bit taken from $25 ends up in bit 0
#   $3        byte offset of the output row inside `transposed`
#
# NOTE(review): $26/$27 are $k0/$k1, which the usual MIPS register convention
# reserves for the kernel/exception handler. That is harmless on a bare
# simulator, but choose other scratch registers if exceptions or interrupts
# can occur while this runs.
loop:
        addiu   $2, $2, -1              # advance to the next lower bit index
        srl     $9, $9, 1               # keep the mask in step: $9 = 1 << $2
        addu    $27, $0, $0             # start a fresh output row

        and     $26, $10, $9            # input byte 0: isolate bit $2 ...
        srlv    $26, $26, $2            # ... and move it down to bit 0
        or      $27, $27, $26           # first bit: nothing to shift yet

        and     $26, $11, $9            # input byte 1
        srlv    $26, $26, $2
        sll     $27, $27, 1             # make room, then append the new bit
        or      $27, $27, $26

        and     $26, $12, $9            # input byte 2
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $13, $9            # input byte 3
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $14, $9            # input byte 4
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $15, $9            # input byte 5
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $16, $9            # input byte 6
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $17, $9            # input byte 7
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $18, $9            # input byte 8
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $19, $9            # input byte 9
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $20, $9            # input byte 10
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $21, $9            # input byte 11
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $22, $9            # input byte 12
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $23, $9            # input byte 13
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $24, $9            # input byte 14
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        and     $26, $25, $9            # input byte 15
        srlv    $26, $26, $2
        sll     $27, $27, 1
        or      $27, $27, $26

        sll     $3, $2, 1               # halfword index -> byte offset
        sh      $27, transposed($3)     # store the finished row at index $2

        # bgtz, not bgez: $2 was already decremented at the top of this pass,
        # so branching while $2 >= 0 would start a 9th pass with $2 = -1 and
        # $9 = 0, which stores a halfword at transposed-2 (outside the table).
        bgtz    $2, loop
        nop                             # branch delay slot
# Input matrix: 16 rows of 8 bits, one byte per row, placed at address 0x2000.
# The bit pattern of each row is shown on the right.
.data 0x2000
matrix:
        .byte   0x80, 0x80              # 1000 0000  (rows 0-1)
        .byte   0x40, 0x40              # 0100 0000  (rows 2-3)
        .byte   0x20, 0x20              # 0010 0000  (rows 4-5)
        .byte   0x10, 0x10              # 0001 0000  (rows 6-7)
        .byte   0x08, 0x08              # 0000 1000  (rows 8-9)
        .byte   0x04, 0x04              # 0000 0100  (rows 10-11)
        .byte   0x02, 0x02              # 0000 0010  (rows 12-13)
        .byte   0x01, 0x01              # 0000 0001  (rows 14-15)
# Output matrix: 8 rows of 16 bits, one halfword per row, placed at address
# 0x3000 and initialised to zero. The loop stores row $2 at transposed + 2*$2.
.data 0x3000
transposed:
        .half   0, 0, 0, 0, 0, 0, 0, 0
It reads the input matrix and then performs a loop 8 times (once for each transposed matrix row).