2

There is very efficient assoc. array C language implementation used in php source code.

/*
 * HashTable Data Layout
 * =====================
 *
 *                 +=============================+
 *                 | HT_HASH(ht, ht->nTableMask) |
 *                 | ...                         |
 *                 | HT_HASH(ht, -1)             |
 *                 +-----------------------------+
 * ht->arData ---> | Bucket[0]                   |
 *                 | ...                         |
 *                 | Bucket[ht->nTableSize-1]    |
 *                 +=============================+
 */

Structures:

typedef struct _Bucket {
    zval              val;
    zend_ulong        h;                /* hash value (or numeric index)   */
    zend_string      *key;              /* string key or NULL for numerics */
} Bucket;

typedef struct _zend_array HashTable;

struct _zend_array {
    zend_refcounted_h gc;
    union {
        struct {
            ZEND_ENDIAN_LOHI_4(
                zend_uchar    flags,
                zend_uchar    _unused,
                zend_uchar    nIteratorsCount,
                zend_uchar    _unused2)
        } v;
        uint32_t flags;
    } u;
    uint32_t          nTableMask;
    Bucket           *arData;
    uint32_t          nNumUsed;
    uint32_t          nNumOfElements;
    uint32_t          nTableSize;
    uint32_t          nInternalPointer;
    zend_long         nNextFreeElement;
    dtor_func_t       pDestructor;
};

example function:

static zend_always_inline Bucket *zend_hash_find_bucket(const HashTable *ht, zend_string *key)
{
    zend_ulong h;
    uint32_t nIndex;
    uint32_t idx;
    Bucket *p, *arData;

    h = zend_string_hash_val(key);
    arData = ht->arData;
    nIndex = h | ht->nTableMask; //index calculation
    idx = HT_HASH_EX(arData, nIndex);
    while (EXPECTED(idx != HT_INVALID_IDX)) {
        p = HT_HASH_TO_BUCKET_EX(arData, idx);
        if (EXPECTED(p->key == key)) { /* check for the same interned string */
            return p;
        } else if (EXPECTED(p->h == h) &&
             EXPECTED(p->key) &&
             EXPECTED(ZSTR_LEN(p->key) == ZSTR_LEN(key)) &&
             EXPECTED(memcmp(ZSTR_VAL(p->key), ZSTR_VAL(key), ZSTR_LEN(key)) == 0)) {
            return p;
        }
        idx = Z_NEXT(p->val);
    }
    return NULL;
}

h is a big integer returned by a hash function.

The question is: Why index calculating this way?

nIndex = h | ht->nTableMask; //index calculation

Why not simple remainder of the division h integer on hashtable size?

nIndex = h & (ht->nTableSize - 1); //analog: nIndex = h % ht->nTableSize
legale
  • 612
  • 7
  • 9
  • a division is always more expensive than a bitwise or operator. The two operations you post are different.... A hash function has only one requirement: to spread hash values as much as possible and uniformly in the space of index keys. – Luis Colorado Dec 31 '18 at 12:05

2 Answers2

4

This is to make the number negative. The layout of the hash table is really brain-dead (Zend/zend_types.h):

/*
 * HashTable Data Layout
 * =====================
 *
 *                 +=============================+
 *                 | HT_HASH(ht, ht->nTableMask) |
 *                 | ...                         |
 *                 | HT_HASH(ht, -1)             |
 *                 +-----------------------------+
 * ht->arData ---> | Bucket[0]                   |
 *                 | ...                         |
 *                 | Bucket[ht->nTableSize-1]    |
 *                 +=============================+
 */

The ht->nTableMask is an integer that interpreted as 2's complement is negative, the intent is that by ORring with this, and converting to int32_t you get a negative offset from ht->arData. Then the ht->arData which is of type pointer to Bucket is cast to pointer to uint32_t and that pointer is indexed using negative indexes. I.e. all this dubious trickery exists to not need to have 2 pointers per hash table but using 1 that points to the middle of the data structure.

A modulo using AND and subtracting from ht->arData would have sufficed, and resulted in identical operation - it seems that this has been hand-optimized to be fast on some bad compiler.

1

NikiC wrote: This is basically the same as h & size -1, but going into the negative numbers instead of positives

Instead of setting the top bits to zero you set them to one So you get a negative number between -1 and -size

The mask is -(size << 1). Which is the same as ~(size << 1) + 1 or ~((size << 1) - 1)

So basically the same way you get the usual mask, but a) inverted because you want the top bits set not the bottom and b) shifted by one so the boundary is handled correctly

legale
  • 612
  • 7
  • 9