1
typedef unsigned long Count;
typedef float Weight;
typedef std::map<std::string, Count> StringToCountMap;
typedef std::map<std::string, Weight> StringToWeightMap;
typedef std::map<unsigned long, StringToCountMap> UnsignedToStringToCountMap;
typedef std::map<unsigned long, StringToWeightMap> UnsignedToStringToWeightMap;

typedef std::map<unsigned long, std::size_t> ClustersMap;


class DefaultClusteringAlgorithm
{
public:
    // minumum number of documents changing clusters for algorithm to end
    static const unsigned DocumentChangeThreshold = 0;

    DefaultClusteringAlgorithm(unsigned numClusters, const UnsignedToStringToWeightMap &documentVectors)
        : numClusters_(numClusters)
        , documentVectors_(documentVectors)
    {
    }

~DefaultClusteringAlgorithm() {}

const ClustersMap &DoClustering();

private:
    void ChooseInitialCentroids();
    unsigned ClusterOnCentroids();
    void RecalculateCentroids();
    float DocumentDotProduct(const StringToWeightMap &left, const StringToWeightMap &right);
    float DocumentLength(const StringToWeightMap &document);

    unsigned numClusters_;

    // stores cluster_id => centroid
    std::vector<StringToWeightMap> centroids_;

    // maps question id => cluster id
    ClustersMap clusters_;

    // document vector
    const UnsignedToStringToWeightMap &documentVectors_;
};

void DefaultClusteringAlgorithm::RecalculateCentroids()
{
    std::vector<unsigned> newCentroidsSizes(centroids_.size());
    std::vector<StringToWeightMap> newCentroids(centroids_.size());

    ClustersMap::const_iterator clusterMapping = clusters_.begin();

    for (; clusterMapping != clusters_.end(); ++clusterMapping)
    {
        std::size_t clusterId = clusterMapping->second;

        ++newCentroidsSizes[clusterId];
        const StringToWeightMap &document = documentVectors_.at(clusterMapping->first);

        StringToWeightMap::const_iterator termWeight = document.cbegin();

        for (; termWeight != document.end(); ++termWeight);
        {
            newCentroids[clusterId][termWeight->first] += termWeight->second;
        }
    }

    std::vector<unsigned>::iterator centroidSize = newCentroidsSizes.begin();

    for (; centroidSize != newCentroidsSizes.end(); ++centroidSize)
    {
        std::size_t clusterId = centroidSize - newCentroidsSizes.begin();

        StringToWeightMap::iterator centroidTermWeight = newCentroids[clusterId].begin();

        for (; centroidTermWeight != newCentroids[clusterId].end(); ++centroidTermWeight)
        {
            centroidTermWeight->second /= *centroidSize;
        }
    }
}

debugger watch

The problem occurs in creating the const_iterator termWeight:

StringToWeightMap::const_iterator termWeight = document.begin();

As you can see in the image above the termWeight const_iterator has invalid data. However, the const std::map document is a perfectly valid std::map. I cannot think of any reason why this is happening.

I recently learned that std::map::cbegin() exists. Should I be using that method instead?

EDIT: Included more context

Osuvaldo
  • 274
  • 3
  • 10
  • Which line of code is about to be executed when it shows the above memory dump? The data displayed for `termWeight` is invalid until after the line where it's first assigned. – Brian Cain Nov 29 '13 at 03:55
  • `cbegin` won't help. Are there any `const_cast`s in your code base? One way to get really bad behavior out of a `std::map` is to reorder the contents of the `map` while they are in the `map`. Second, what line are you on? Has any iteration occurred? Are there any other threads? What is the type of `documentVectors_`? – Yakk - Adam Nevraumont Nov 29 '13 at 03:55
  • The next line to be executed is the single line in the inner-most for loop. So termWeight has already been assigned. – Osuvaldo Nov 29 '13 at 03:57
  • There are no other threads. I'll edit the question to show the type of documentVectors_ – Osuvaldo Nov 29 '13 at 03:58
  • If you're going to use `cbegin()`, you should be using `cend()` as well. – John Saxton Nov 29 '13 at 04:48
  • @JohnSaxton I believe that is only a matter of style, and is *not* the OP's problem regardless. – Yakk - Adam Nevraumont Nov 29 '13 at 04:56
  • You almost certainly want to disable `operator=` on `DefaultClusteringAlgorithm` -- failure to do so may result in subtle bugs. Any optimizations enabled? If you change some of your references to copies, does the problem go away? What happens when you "open" `document` in the debugging window, does its contents make sense? – Yakk - Adam Nevraumont Nov 29 '13 at 04:58
  • Thanks, Yakk. That is certainly something to consider. This is a small application for a school project and I am certain that `DefaultClusteringAlgorithm` will never be copied or assigned. As I stated in an answer below, the problem was a rogue semi-colon at the end of the `for` statement right after `termWeight` is created. Such an easy thing to overlook! – Osuvaldo Nov 29 '13 at 05:25

2 Answers2

2

Hah! I found the error! A silly little semi-colon at the end of my for loop!

Osuvaldo
  • 274
  • 3
  • 10
0

The std::map begin() method could be returning an iterator that is pointing to the end of map since there may not be any elements in the map at all.

Andrew
  • 1,581
  • 3
  • 18
  • 31