I'm having trouble getting dedupe to run. I am trying to use this library to remove duplicates from a huge set of addresses. Here is my code:

import collections
import csv
import logging
import optparse
import os
import re

from numpy import nan

import dedupe
from unidecode import unidecode

optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
log_level = logging.WARNING 
if opts.verbose == 1:
    log_level = logging.INFO
elif opts.verbose >= 2:
    log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)

input_file = 'H:/My Documents/Python Scripts/Dedupe/DupeTester.csv'
output_file = 'csv_example_output.csv'
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'

def preProcess(column):
    # Transliterate to ASCII, collapse whitespace, strip stray quotes,
    # and lowercase so that fields compare cleanly.
    column = unidecode(column.decode('utf8'))
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column

def readData(filename):
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            # the CSV's unnamed first column holds the row id
            row_id = int(row[''])
            data_d[row_id] = dict(clean_row)

    return data_d


print 'importing data ...'
data_d = readData(input_file)

if os.path.exists(settings_file):
    print 'reading from', settings_file
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)

else:
    fields = [
        {'field': 'fulladdr', 'type': 'Address'},
        {'field': 'zip', 'type': 'ShortString'},
        ]

    deduper = dedupe.Dedupe(fields)

    deduper.sample(data_d, 200)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    print 'starting active labeling...'

    dedupe.consoleLabel(deduper)

    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    with open(settings_file, 'w') as sf:
        deduper.writeSettings(sf)

print 'blocking...'

threshold = deduper.threshold(data_d, recall_weight=2)

print 'clustering...'
clustered_dupes = deduper.match(data_d, threshold)

print '# duplicate sets', len(clustered_dupes)

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores) :
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output:
    writer = csv.writer(f_output)

    with open(input_file) as f_input :
        reader = csv.reader(f_input)

        heading_row = reader.next()
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership :
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key].encode('utf8'))
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)

Specifically, I am getting the following error when I run it:

C:\Anaconda\lib\site-packages\dedupe\core.py:18: UserWarning: There may be duplicates in the sample
  warnings.warn("There may be duplicates in the sample")
Traceback (most recent call last):

  File "<ipython-input-1-33e46d604c5f>", line 1, in <module>
    runfile('H:/My Documents/Python Scripts/Dedupe/dupetestscript.py', wdir='H:/My Documents/Python Scripts/Dedupe')

  File "C:\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
    execfile(filename, namespace)

  File "H:/My Documents/Python Scripts/Dedupe/dupetestscript.py", line 67, in <module>
    deduper.sample(data_d, 200)

  File "C:\Anaconda\lib\site-packages\dedupe\api.py", line 924, in sample
    random_sample_size))

TypeError: unhashable type: 'numpy.ndarray'
  • Are you sure your sample code is complete? The error message is about a numpy array, which your code doesn't use. The only thing it imports from numpy is nan, which isn't used. – Roland Smith Jan 16 '15 at 21:16
  • I cannot reproduce it, and the code has not been reduced to a minimal example of the actual issue. – strpeter Sep 30 '15 at 15:08
  • @strpeter Not surprising, as this was a bug in the library which was flagged and resolved; see here: https://github.com/datamade/dedupe/issues/336 – Connor M Sep 30 '15 at 16:54

1 Answer

A numpy array can be changed (it is "mutable"). Python speeds up dictionary access by using the hash value of the key instead of the key itself.

So only hashable objects like numbers, strings or tuples can be used as keys in a dictionary. From the Python glossary definition of hashable:

An object is hashable if it has a hash value which never changes during its lifetime (it needs a __hash__() method), and can be compared to other objects (it needs an __eq__() method). Hashable objects which compare equal must have the same hash value.

Hashability makes an object usable as a dictionary key and a set member, because these data structures use the hash value internally.

All of Python’s immutable built-in objects are hashable, while no mutable containers (such as lists or dictionaries) are. Objects which are instances of user-defined classes are hashable by default; they all compare unequal (except with themselves), and their hash value is derived from their id().
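
You can see this directly; a minimal illustration (the array contents here are arbitrary):

import numpy as np

arr = np.array([1, 2, 3])

try:
    d = {arr: 'value'}        # fails: arrays are mutable, hence unhashable
except TypeError as e:
    print(e)                  # unhashable type: 'numpy.ndarray'

d = {tuple(arr): 'value'}     # a tuple of the same data is immutable
print(d[(1, 2, 3)])           # value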

Roland Smith
  • That's unfortunate. What if you are working in Pandas and want to create a dict? Pandas is all numpy under the hood, so... – tumultous_rooster Jun 11 '15 at 03:25
  • You can create dict keys *from immutable data* in a Pandas dataframe or numpy array. You just cannot use an array or dataframe itself as a *key*. You can use it as a value, though. – Roland Smith Jun 11 '15 at 19:47
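
To illustrate that last comment, a small sketch (the column names and values are made up):

import pandas as pd

df = pd.DataFrame({'zip': ['60622', '60647'], 'count': [3, 5]})

# Scalars pulled out of the frame (here, strings) are immutable,
# so they work fine as dictionary keys; the frame itself would not.
counts = {row['zip']: row['count'] for _, row in df.iterrows()}
print(counts['60622'])   # 3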