I'm trying to implement a plagiarism detection software using pattern matching algorithms. I came across the KMP Algorithm Here and tried out the c# implementation. I can see that it's not as fast for actual documents (not strings, I uploaded two pdf documents using iText and got the implementation to check for plagiarism in these two papers. About 50 pages).
It's really slow and I have no idea how to go about this. I've looked at Boyer Moore and Rabin Karp as well.
What I am currently doing is taking each sentence in the document (split on '.') and scanning through the whole reference document (2nd document) for a match. Then taking the next sentence and so on... I am fully aware that this could be very expensive. But I have no idea how else to implement string (pattern) matching without using this approach. It's for my final year project and I was given a topic so I HAVE to use string matching. (Not allowed to do Citation based plagiarism, Semantics or Vector Space.)
The larger the text and pattern gets, the slower the algorithm gets (extremely slow, not even reasonably slow). Is there another way to go about this that I don't know? Or are there faster algorithms for me to use with this my approach?
EDIT
My code below:`
public class MatchChecker
{
public void KMPSearch(string pattern, string text, int page)
{
if (pattern.Trim() != "")
{
int M = pattern.Length;
int N = text.Length;
// create lps[] that will hold the longest
// prefix suffix values for pattern
int[] lps = new int[M];
int j = 0; // index for pat[]
// Preprocess the pattern (calculate lps[]
// array)
computeLPSArray(pattern, M, lps);
int i = 0; //index for text[]
while (i < N)
{
if (pattern[j] == text[i])
{
j++;
i++;
}
if (j == M)
{
Console.WriteLine("Found pattern " + pattern + " at page " + page);
j = lps[j - 1];
}
//mismatch after j matches
else if (i < N && pattern[j] != text[i])
{
//Do not match lps[0..lps[j-1]] characters,
//they will match anyway
if (j != 0)
j = lps[j - 1];
else
i = i + 1;
}
}
}
}
private void computeLPSArray(string pattern, int M, int[] lps)
{
//length of the previous longest prefix suffix
int length = 0;
int i = 1;
lps[0] = 0; //lps[0]is always 0
//the loop calculates lps[i] for i = 1 to M - 1
while (i < M)
{
if (pattern[i] == pattern[length])
{
length++;
lps[i] = length;
i++;
}
else // (pat[i] != pat[len])
{
// This is tricky. Consider the example.
// AAACAAAA and i = 7. The idea is similar
// to search step.
if (length != 0)
{
length = lps[length - 1];
// Also, note that we do not increment
// i here
}
else // if (len == 0)
{
lps[i] = length;
i++;
}
}
}
}
public string ReadDocPDF()
{
List<string> pages = new List<string>();
PdfReader reader2 = new PdfReader(@"C:\Users\obn\Desktop\project\chapter1.pdf");
string strText = string.Empty;
for (int page = 1; page <= reader2.NumberOfPages; page++)
{
ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
PdfReader reader = new PdfReader(@"C:\Users\obn\Desktop\project\chapter1.pdf");
String s = PdfTextExtractor.GetTextFromPage(reader, page, its);
s = Regex.Replace(Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(s))).Replace(",", ""), "[0-9]", "").ToLower();
pages.Add(s);
strText = strText + s;
reader.Close();
}
return strText;
}
public void CheckForPlag()
{
string document = ReadDocPDF().Trim();
string[] sentences = document.Split(new string[] { "\n", "\t\n\r", ". ", "." }, StringSplitOptions.RemoveEmptyEntries);
foreach(string sentence in sentences) {
PdfReader reader2 = new PdfReader(@"C:\Users\obn\Documents\visual studio 2015\Projects\PlagDetector\PlagDetector\bin\Debug\test3.pdf");
for (int page = 1; page <= reader2.NumberOfPages; page++)
{
ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
PdfReader reader = new PdfReader(@"C:\Users\obn\Documents\visual studio 2015\Projects\PlagDetector\PlagDetector\bin\Debug\test3.pdf");
String s = PdfTextExtractor.GetTextFromPage(reader, page, its);
s = Regex.Replace(Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(s))).Trim().Replace(".","").Replace(",","").Replace("\n", ""), "[0-9]", "").ToLower();
KMPSearch(sentence, s, page);
reader.Close();
}
}
}
}`