2

I am trying to modify the fuzzywuzzy library. Its `process` module returns each matched array element together with its score, but I want it to also return the index of the element, so that every result is a group of (item, score, index).
Here is what I tried:

#!/usr/bin/env python
# encoding: utf-8
from fuzzywuzzy import fuzz
from fuzzywuzzy import utils
import heapq
import logging
from functools import partial


# Scorer used by the extract functions below when the caller does not pass one.
default_scorer = fuzz.WRatio


# Processor used by the extract functions below when the caller does not pass one.
default_processor = utils.full_process


def extractWithoutOrder(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
    """Lazily score every choice against ``query`` and yield the matches.

    Args:
        query: The value to match against each choice.
        choices: Either a mapping-like object exposing ``items()`` or any
            iterable of choices. ``None`` or an empty sized container
            produces no results.
        processor: Callable applied to the query and to every choice before
            scoring. ``None`` means "use the values unchanged".
        scorer: Callable ``scorer(processed_query, processed_choice)``
            returning a score.
        score_cutoff: Only matches with ``score >= score_cutoff`` are yielded.

    Yields:
        ``(choice, score, key, index)`` for mapping-like ``choices`` and
        ``(choice, score, index)`` otherwise, where ``index`` is the 0-based
        position of the choice in iteration order.
    """
    def no_process(x):
        return x

    try:
        if choices is None or len(choices) == 0:
            # A bare return ends the generator cleanly. Raising StopIteration
            # here is turned into RuntimeError by PEP 479 (Python 3.7+).
            return
    except TypeError:
        # choices has no len() (e.g. a generator); fall through and iterate.
        pass

    if processor is None:
        processor = no_process

    processed_query = processor(query)

    if len(processed_query) == 0:
        logging.warning(u"Applied processor reduces input query to empty string, "
                        "all comparisons will have score 0. "
                        "[Query: \'{0}\']".format(query))

    ascii_scorers = [fuzz.WRatio, fuzz.QRatio,
                     fuzz.token_set_ratio, fuzz.token_sort_ratio,
                     fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio]
    unicode_scorers = [fuzz.UWRatio, fuzz.UQRatio]

    # Don't run full_process twice
    if scorer in ascii_scorers + unicode_scorers \
            and processor == utils.full_process:
        processor = no_process

    # Only process the query once instead of for every choice
    if scorer in unicode_scorers:
        pre_processor = partial(utils.full_process, force_ascii=False)
        scorer = partial(scorer, full_process=False)
    elif scorer in ascii_scorers:
        pre_processor = partial(utils.full_process, force_ascii=True)
        scorer = partial(scorer, full_process=False)
    else:
        pre_processor = no_process
    processed_query = pre_processor(processed_query)

    # Detect a dictionary-like object up front. Only the ``items`` lookup is
    # guarded, so an AttributeError raised by the processor or scorer while
    # scoring is not mistaken for "choices is a plain iterable".
    try:
        keyed_choices = choices.items()
    except AttributeError:
        keyed_choices = None

    if keyed_choices is not None:
        for index, (key, choice) in enumerate(keyed_choices):
            processed = pre_processor(processor(choice))
            score = scorer(processed_query, processed)
            if score >= score_cutoff:
                yield (choice, score, key, index)
    else:
        # It's a list (or other plain iterable); just iterate over it.
        for index, choice in enumerate(choices):
            processed = pre_processor(processor(choice))
            score = scorer(processed_query, processed)
            if score >= score_cutoff:
                yield (choice, score, index)


def extract(query, choices, processor=default_processor, scorer=default_scorer, limit=5):
    """Return the matches for ``query`` among ``choices``, highest score first.

    The matches come from ``extractWithoutOrder`` and are ranked by their
    second tuple element (the score). With ``limit=None`` every match is
    returned; otherwise at most ``limit`` top-scoring matches are returned.
    """
    def by_score(match):
        return match[1]

    matches = extractWithoutOrder(query, choices, processor, scorer)
    if limit is None:
        return sorted(matches, key=by_score, reverse=True)
    return heapq.nlargest(limit, matches, key=by_score)

When I ran it, the result was the same as what the unmodified fuzzywuzzy returned, without the index:

# NOTE(review): presumably `process` is the modified module above saved as process.py — confirm
import process as p
box=['ness', 'apple','banana','carrot','duck','eagle','fish','gate','hitler']
# extract() defaults to limit=5, so at most five matches come back.
p.extract('b',box)
[('banana', 90), ('apple', 0), ('carrot', 0), ('duck', 0), ('eagle', 0)]

But what I am expecting it to return is:

[('banana', 90, 2), ('apple', 0, 1), ('carrot', 0, 3), ('duck', 0, 4), ('eagle', 0, 5)]

Any suggestions would be appreciated.

Jaffer Wilson
  • 7,029
  • 10
  • 62
  • 139

2 Answers

2

As an alternative to FuzzyWuzzy you could use RapidFuzz (I am the author) which will return the index as well:

from rapidfuzz import process

box=['ness', 'apple','banana','carrot','duck','eagle','fish','gate','hitler']
# Call through the imported module name; this snippet never defines an alias `p`.
process.extract('b',box)

which returns

[('banana', 90, 2), ('apple', 0, 1), ('carrot', 0, 3), ('duck', 0, 4), ('eagle', 0, 5)]
maxbachmann
  • 2,862
  • 1
  • 11
  • 35
1

For those looking for an answer: you can pass a dictionary to `process.extract`, and the key of each matched entry is returned alongside the match and its score.

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

box = ['apple','banana','carrot','duck','eagle']
# Key every choice by its position in the list.
box_dict = dict(enumerate(box))

process.extract("b", box_dict, scorer=fuzz.WRatio)
# O/P -> [("banana", 90, 1), ('apple', 0, 0), ('carrot', 0, 2), ('duck', 0, 3), ('eagle', 0, 4)]
thisisbhavin
  • 344
  • 3
  • 15