Tarik Guelzim

Computer Science, Networking, Telecom, Web 2.0 & Open Source

An expert is a person who has made all the mistakes that can be made in a very narrow field.


--- Niels Bohr (1885 - 1962)

I recently came across a need to search the “bulletin officiel” in Morocco, which is a government magazine that lists laws, verdicts, explanations as well as all other official material. Given that the solution presented by a government website didn’t allow searching this database, I decided to create a search engine that parses, indexes and returns results for queries on terms, Google-style strings and regex. Among other requirements, this search engine must be platform independent and command line based in order to integrate it with the Linux shell as well as CGI scripts. I designed it using Java and Lucene, aspriseX and pdfbox. The following is the code (beta) that demonstrates this functionality:


/**
* Tarik Guelzim
* This is an indexer/search engine for pdf files
* Version 0.1
*
* OCR functionality is still in pre-alpha
*/

package core;

import com.asprise.util.ocr.OCR;
import com.asprise.util.pdf.PDFReader;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;

public class Main {

/**
* @param args pdf file o be added
*/
public static void main(String[] args) {
HashMap params = null;
// parse args
params = parseArgs(args);
// dispatch commands
if (((String) params.get(”cmd”)).equals(”index”)) {
// build index
buildIndex((String) params.get(”idxFile”),
(String) params.get(”pdfFile”));

} else if (((String) params.get(”cmd”)).equals(”query”)) {

displayIndex(searchIndex((String) params.get(”idxFile”),
(String) params.get(”qString”),
(String) params.get(”dField”)));
} else if (((String) params.get(”cmd”)).equals(”optimize”)) {
optimizeIndex(((String) params.get(”idxFile”)));
} else if (((String) params.get(”cmd”)).equals(”ocr”)) {
ocrPdf((String) params.get(”pdfFile”));
} else {
System.err.println(”Usage: pdfagent {query | index} index_file pdf_file_name”);
}
}

/**
* OCR text from images
* @param fname pdf file
*/
private static void ocrPdf(String fname) {
try {
OCR ocr = new OCR();
PDFReader reader = new PDFReader(new File(fname));
reader.open();
int pages = reader.getNumberOfPages();
for (int i = 0; i < pages; i++) {
BufferedImage image = reader.getPageAsImage(i);
System.out.println(ocr.recognizeEverything(image));
}
reader.close();
} catch (IOException ex) {
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
}
}

/**
* optimize index
* @param idxFile index path
*/
private static void optimizeIndex(String idxFile) {
try {
IndexWriter iw = new IndexWriter(idxFile, new StandardAnalyzer(), isIndexExist(idxFile));
iw.optimize();
} catch (IOException ex) {
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
}
}

/**
* build pd index
* @param idxFile index file path
* @param pdfFile pdf file
*/
private static void buildIndex(String idxFile, String pdfFile) {

IndexWriter iw = null;
LucenePDFDocument pdfDoc = null;
Document luceneDoc = null;

try {
// lucene index

iw = new IndexWriter(idxFile, new StandardAnalyzer(),
isIndexExist(idxFile));

pdfDoc = new LucenePDFDocument();
luceneDoc = LucenePDFDocument.getDocument(new FileInputStream(pdfFile));

// luceneDoc = pdfDoc.convertDocument(new File(pdfFile));
addDoc(iw, luceneDoc);

iw.optimize();
iw.close();

} catch (IOException ex) {
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
}
}

private static void displayIndex(Hits hits) {
try {
if (hits.length() == 0) {
System.out.println(”No results found.”);
} else {
for (int i = 0; i < hits.length(); i++) {
System.out.println(”————————-”);
System.out.println(”URL: ” + hits.doc(i).get(”url”));
System.out.println(”Modified: ” + hits.doc(i).get(”modified”));
System.out.println(”————————-”);

System.out.println(hits.doc(i).get(”contents”));
}
}
} catch (IOException ex) {
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
}
}

/**
* search the index for a striĀ  ng
* @param indexFile index file path
* @param qString string query
* @param idxField the field to search
* @return hits for the query
*/
private static Hits searchIndex(String indexFile, String qString, String idxField) {
Hits hits = null;
IndexSearcher searcher = null;
Directory fsDir = null;
Query query = null;
QueryParser qParser = null;

try {
//query index
System.out.println(”Searching for ” + qString + ” in ” + indexFile + ” — field: ” + idxField);

// create index searcher
fsDir = FSDirectory.getDirectory(indexFile, false);
searcher = new IndexSearcher(fsDir);
// parse query
qParser = new QueryParser(idxField, new StandardAnalyzer());
query = qParser.parse(qString);
//query = QueryParser.parse(qString, idxField, new StandardAnalyzer());
System.out.println(query.toString());
// search
hits = searcher.search(query);

} catch (IOException ex) {
System.err.println(”Index file” + indexFile + ” doesn’t exist”);
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
System.exit(1);
} catch (ParseException ex) {
System.err.println(”Error while parsing the index.”);
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
} finally {
return hits;
}

}

/**
* Singelton for indx file
* @param idxFile file to check
* @return exist or not
*/
private static boolean isIndexExist(String idxFile) {
File file;
boolean create = true;

if (((file = new File(idxFile)).exists() && file.isDirectory())) {
create = false;
}

return create;
}

/**
* parse cmd line args
* @param args params
* @return array of params
*/
private static HashMap parseArgs(String[] args) {
HashMap cmdMap = new HashMap();
switch (args.length) {
case 2:
cmdMap.put(”cmd”, (String) args[0]);
cmdMap.put(”idxFile”, (String) args[1]);
cmdMap.put(”pdfFile”, (String) args[1]);
break;
case 3:
cmdMap.put(”cmd”, (String) args[0]);
cmdMap.put(”idxFile”, (String) args[1]);
cmdMap.put(”pdfFile”, (String) args[2]);
break;
case 4:
cmdMap.put(”cmd”, (String) args[0]);
cmdMap.put(”idxFile”, (String) args[1]);
cmdMap.put(”qString”, (String) args[2]);
cmdMap.put(”dField”, (String) args[3]);
break;
default:
System.err.println(”Usage: pdfagent {query | index} index_file pdf_file_name”);
System.exit(1);
break;
}
return cmdMap;
}

/**
* Add document to the index
* @param iw
* @param doc
*/
private static void addDoc(IndexWriter iw, Document doc) {
try {
iw.addDocument(doc);

} catch (IOException ex) {
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
}
}
}

Comments are closed.