0

I can not figure out where I'm having my problem with my heap sort.
The program takes a filename from the command line, imports the words into a vector then that vector is turned into a vector pair of vector<string,int> where string is the word and int is the count of how many instances of that word are in the file.

The vector<PAIR> is then sorted by either the string (value or v) or by int (key or k). My sorting by Key works fine however sort by value is off. I suspect I'm missing an if statement in max_heapify when sorting by value. Here's my code:

main.cpp

#include <fstream>
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <string>
#include <string.h>
#include <stdio.h>
#include <map>
#include <time.h>
#include "readwords.h"

using namespace std;

readwords wordsinfile;
vector<string> allwords;
bool times;
char *filename;
timespec timestart,timeend;
vector< pair<string,int> > allwords_vp;

timespec diffclock(timespec start, timespec end);

int main ( int argc, char *argv[] ) {

    filename = argv[1];

    //Lets open the file 
    ifstream ourfile2(filename);

    //Lets get all the words using our requirements
    allwords = wordsinfile.getwords(ourfile2);
    //Convert all the words from file and count how many times they 
    //appear.  We will store them in a vector<string,int> string 
    //being the word and int being how many time the word appeared
    allwords_vp = wordsinfile.count_vector(allwords);

    cout << "HeapSort by Values" << endl;
        if (times) {
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &timestart);
                wordsinfile.heapsort(const_cast<char *>("v"));
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &timeend);
                cout << "HeapSort by Values ran in "
                     << diffclock(timestart,timeend).tv_nsec << " nanosecond or "
                     << diffclock(timestart,timeend).tv_nsec/1000 << " millisecond"
                     << endl;
        } else {
                wordsinfile.heapsort(const_cast<char *>("v"));
        }

    cout << "HeapSort by Keys" << endl;
        if (times) {
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &timestart);
                wordsinfile.heapsort(const_cast<char *>("k"));
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &timeend);
                cout << "HeapSort by Keys ran in "
                     << diffclock(timestart,timeend).tv_nsec << " nanosecond or "
                     << diffclock(timestart,timeend).tv_nsec/1000 << " millisecond"
                     << endl;
        } else {
                wordsinfile.heapsort(const_cast<char *>("k"));
        }
}

timespec diffclock(timespec start, timespec end) {
    timespec temp;
    if ((end.tv_nsec-start.tv_nsec)<0) {
        temp.tv_sec = end.tv_sec-start.tv_sec-1;
        temp.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec;
    } else {
        temp.tv_sec = end.tv_sec-start.tv_sec;
        temp.tv_nsec = end.tv_nsec-start.tv_nsec;
    }
    return temp;
}

readwords.h

#ifndef READWORDS_H
#define READWORDS_H

#include <vector>
#include <map>
#include <utility>
#include <time.h>

typedef std::pair<std::string, int> PAIR;

bool isasciifile(std::istream& file);

class readwords {
    private:
         std::vector<PAIR> vp;
    public:
         std::vector<std::string> getwords(std::istream& file);
         std::vector<PAIR> count_vector(std::vector<std::string> sv);
         void print_vectorpair(std::vector<PAIR> vp);
         void print_vector(std::vector<std::string> sv);     
         void heapsort(char how[]);
         void buildmaxheap(std::vector<PAIR> &vp, int heapsize, char how[]);
         void max_heapify(std::vector<PAIR> &vp, int i, int heapsize, char how[]);
         void swap_pair(PAIR &p1, PAIR &p2);
};

readwords.cpp

#include <fstream>
#include <iostream>
#include <map>
#include "readwords.h"
#include <vector>
#include <string>
#include <utility>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

//using std::vector;
using namespace std;

typedef pair<string, int> PAIR;

// Do we have a ASCII file?
// Lets test the second 10 chars to make sure
// This method is flawed if the file is less than 10 chars
bool isasciifile(std::istream& file) {
    int c = 0;
    bool foundbin = false;
    for(c=0; c < 10;c++) {
        if(!isprint(file.get())){
            // Looks like we found a non ASCII file, or its empty.
            foundbin = true;
        }
    }
    return foundbin;
}

// This is our workhorse as it splits up the words based on our criteria and
// passes them back as a vector of strings.
vector<string> readwords::getwords(std::istream& file) {
    char c;
    string aword;
    vector<string> sv;

            //Let go through the file till the end  
                        while(file.good()) {
                                c = file.get();
                                if (isalnum(c)) {
                    //convert any uppercase to lowercase
                    if(isupper(c)) {
                                                c = (tolower(c));
                                        }
                    //if its a space lets go onto the next char
                                        if(isspace(c)) { continue; }
                    //everything looks good lets add the char to our word
                                        aword.insert(aword.end(),c);
                                } else {
                    //its not a alphnum or a space so lets skip it
                    if(!isspace(c)) { continue; }
                    //reset our string and increment
                                        if (aword != "") {sv.push_back(aword);}
                                        aword = "";
                                        continue;
                                }
                        }
    return sv;
}

vector<PAIR> readwords::count_vector(vector<string> sv) {
    unsigned int i = 0;
    int j = 0;
    int match = 0;

    // cout << "Working with these string: " << endl;
    // print_vector(sv);

    for (i=0; i < sv.size(); i++) {
        // cout << "count of i: " << i << " word is: " << sv.at(i) << endl;

        match = 0;
        if(readwords::vp.size() == 0) {
            readwords::vp.push_back(make_pair(sv.at(i),1)); continue;
        }

        for (j=readwords::vp.size() - 1; j >= 0; --j) {
            if (sv.at(i) == readwords::vp.at(j).first) {
                // cout << "Match found with: " << sv.at(i) << endl;;
                readwords::vp.at(j).second = readwords::vp.at(j).second + 1;
                match = 1;
            } 

            // cout << "Value of j and match: " << j << match << endl;
            if ( j == 0 && match == 0) {
                // cout << "Match found at end with: " << sv.at(i) << endl;;
                readwords::vp.push_back(make_pair(sv.at(i),1));
            }
        }
    }

    //Prob need to sort by first data type then second here, prior to sort functions.
    //Might not be the best place as the sort functions would alter it, if not here
    //then each sort requires to do secondary search

    return readwords::vp;
}

void readwords::print_vectorpair(vector<PAIR> vp) {
    unsigned int i = 0;

    for (i=0; i < vp.size(); ++i) {
        cout << vp.at(i).first << " " << vp.at(i).second << endl;
    }
}

void readwords::print_vector(vector<string> sv) {
    unsigned int i = 0;

    for (i=0; i < sv.size(); ++i) {
        cout << sv.at(i) << endl;
    }
}

void readwords::heapsort(char how[]) {
    int heapsize = (readwords::vp.size() - 1);

    buildmaxheap(readwords::vp, heapsize, how);

    for(int i=(readwords::vp.size() - 1); i >= 0; i--) {
        swap(readwords::vp[0],readwords::vp[i]);
        heapsize--;
        max_heapify(readwords::vp, 0, heapsize, how);
    }

    print_vectorpair(readwords::vp);
}

void readwords::buildmaxheap(vector<PAIR> &vp, int heapsize, char how[]) {

    for(int i=(heapsize/2); i >= 0 ; i--) {
        max_heapify(vp, i, heapsize, how);
    }
}

void readwords::max_heapify(vector<PAIR> &vp, int i, int heapsize, char how[]) {
    int left = ( 2 * i ) + 1;
    int right = left + 1;
    int largest;

    if(!strcmp(how,"v")) {  
        if(left <= heapsize && vp.at(left).second >= vp.at(i).second ) {
            if( vp.at(left).first >= vp.at(i).first ) {
                largest = left;
            } else {
                largest = i;
            }
        } else {
            largest = i;
        }

        if(right <= heapsize && vp.at(right).second >= vp.at(largest).second) {
            if( vp.at(right).first >= vp.at(largest).first) {
                largest = right;
            }
        }   
    }

    if(!strcmp(how,"k")) {  
        if(left <= heapsize && vp.at(left).first > vp.at(i).first) {
            largest = left;
        } else {
            largest = i;
        }

        if(right <= heapsize && vp.at(right).first > vp.at(largest).first) {
            largest = right;
        }   
    }

    if(largest != i) {
        swap(vp[i], vp[largest]);
        max_heapify(vp, largest, heapsize, how);
    }
}
Deduplicator
  • 44,692
  • 7
  • 66
  • 118
Ryan
  • 167
  • 1
  • 3
  • 12
  • 6
    I think if you want this question answered, you should do some work on this program. Remove all the stuff that is working (i.e. remove the timing code, the file reading code and the sorting by key code) and just post a smaller program that isn't working. Personally I'm not prepared to examine a program that big for bugs. You may also find that by cutting out the working code you discover what the problem is for yourself. – john Oct 13 '12 at 06:17

2 Answers2

1

The vector is then sorted by either the string (value or v) or by int (key or k).

That description doesn't match the code, sorting with a how parameter of "k" sorts by the first component only, which is the string, and sorting with "v" as how parameter takes both components into account.

I think it's a rather bad idea to pass a char[] to determine the sorting criterion, it should be a comparator function, so you need only one implementation in max_heapify.

My sorting by Key works fine however sort by value is off. I suspect I'm missing an if statement in max_heapify when sorting by value.

The problem is that a heap sort needs a total ordering or it won't sort properly.

Your conditions

if(left <= heapsize && vp.at(left).second >= vp.at(i).second ) {
    if( vp.at(left).first >= vp.at(i).first ) {
        largest = left;
    } else {
        largest = i;
    }
} else {
    largest = i;
}

check whether both components of vp.at(left) (resp. right) are at least as large as the corresponding component of vp.at(i), resulting in the product partial ordering, two general pairs are not comparable, and in that case, your max_heapify doesn't do anything.

Example, for <"a",3>, <"b",2> and <"c",1> in the positions i, left, right, in whichever order, your max_heapify sets largest to i.

If your sorting by "v" is meant to sort based on the int component first, and in case of a tie, take the string component into account, you'd need to distinguish the cases vp.at(left).second > vp.at(i).second and equality (for right too, of course). For example

if(left <= heapsize && vp.at(left).second >= vp.at(i).second ) {
    if(vp.at(left).second > vp.at(i).second || vp.at(left).first >= vp.at(i).first ) {
        largest = left;
    } else {
        largest = i;
    }
} else {
    largest = i;
}
Daniel Fischer
  • 181,706
  • 17
  • 308
  • 431
-1

To sort a vector<pair<string, int> > by values, consider adding vector<pair<int, string> >

vector<pair<int, string> > v(orignal.size());
for (int i = 0; i < v.size(); ++i) v[i] = make_pair(original[i].second, original[i].first);
sort(v.begin(), v.end());