Uncategorized

Solr Search Engine for Desktop

                     Finding a file on a Windows machine can take a good amount of time. Here is a simple system that indexes all the desktop files in Solr, a well-known open source search engine, so that they can be found in seconds.

Solr is an open source search engine from the Apache Software Foundation: https://lucene.apache.org/solr/.

Step 1: Download Solr and run it in cloud mode. Here is the guide to get started with Solr: https://lucene.apache.org/solr/guide/7_7/solr-tutorial.html#solr-tutorial

Once you have downloaded and extracted Solr into a folder, start it with the following command:

>./bin/solr start -c -p 8983 -s ../example/cloud/node1/solr

This starts the first node, which may take approximately 30 seconds. It also starts a ZooKeeper instance (on port 9983) to manage the nodes. Once the first node is up, use the following command to start the second node:

>./bin/solr start -c -p 7574 -s ../example/cloud/node2/solr -z localhost:9983

This starts Solr as a two-node cluster, which can be accessed at http://localhost:8983/solr

Step 2: Create a collection for indexing files

>solr create -c pdf-files -s 2 -rf 2

Step 3: Write a program that collects the file names and indexes them in the Solr search engine.

package com.kvn.web.solrClients;

import java.io.File;
import java.io.IOException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.function.Consumer;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;

/**
 * Command-line indexer that walks a fixed set of folders and pushes the name,
 * absolute path and last-modified time of every matching document to a Solr
 * collection over HTTP.
 *
 * <p>All work happens on the calling thread; the class is not designed for
 * concurrent use.
 */
public class CustomCloudSolrClient {

	/**
	 * URL of the Solr collection to index into; points at a Solr node on the local machine.
	 */
	String baseUrl = "http://localhost:8983/solr/pdf-files";

	/**
	 * Root folders to index. You may not need to index all the folders; roots
	 * that do not exist or cannot be read are skipped.
	 */
	String baseFoldersToIndex[] = { "C:/docs", "C:/Java Majestic", "C:/JavaMajestic", "C:/lastminute.comdocs",
			"C:/Users/prabhukvn/Documents", "c:/Users/prabhukvn/Downloads" };
	/**
	 * File name extensions (including the leading dot) of the files to index.
	 * Matching is done on the end of the file name and ignores case.
	 */
	String fileType[] = { ".pdf", ".doc", ".docx" };

	/**
	 * Hand-off queue between discovering a folder and indexing it. Every
	 * {@code put} is immediately followed by a {@code take} on the same thread,
	 * so at most one folder is pending at any time.
	 */
	BlockingQueue<File> fileNameQueue = new LinkedBlockingDeque<File>(20);

	/**
	 * Program entry point: creates an indexer and runs it once.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		CustomCloudSolrClient client = new CustomCloudSolrClient();
		client.start();
	}

	/**
	 * Starts the indexing process: connects to Solr, indexes every configured
	 * root folder in order and closes the client afterwards.
	 */
	private void start() {
		// try-with-resources guarantees the client is closed even if indexing throws.
		try (HttpSolrClient solrClient = connect()) {
			Arrays.stream(baseFoldersToIndex).forEach(applyIndex(solrClient));
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Builds the action that indexes one root folder given as a path string.
	 *
	 * @param solrClient client used to send documents to Solr
	 * @return a consumer that indexes the folder at the path it receives
	 */
	private Consumer<? super String> applyIndex(HttpSolrClient solrClient) {
		return baseFolderToIndex -> enqueueAndIndex(solrClient, new File(baseFolderToIndex));
	}

	/**
	 * Queues a folder and immediately indexes it.
	 *
	 * @param solrClient client used to send documents to Solr
	 * @param folder     folder to index
	 */
	private void enqueueAndIndex(HttpSolrClient solrClient, File folder) {
		try {
			fileNameQueue.put(folder);
		} catch (InterruptedException e) {
			// Restore the flag and skip this folder: calling pushDocs() now would
			// block forever on take() because nothing was queued.
			Thread.currentThread().interrupt();
			e.printStackTrace();
			return;
		}
		pushDocs(solrClient);
	}

	/**
	 * Takes the next folder from the queue, collects its matching files
	 * (descending into sub-folders along the way) and sends them to Solr as one
	 * batch followed by a commit. Failures are reported and the folder is
	 * skipped so that the remaining folders are still processed.
	 *
	 * @param solrClient client used to send documents to Solr
	 */
	private void pushDocs(HttpSolrClient solrClient) {
		try {
			File folderPath = fileNameQueue.take();
			File fileList[] = folderPath.listFiles();
			if (fileList == null) {
				// listFiles() returns null when the path is not a directory or cannot be read.
				System.out.println("Skipping missing or unreadable folder: " + folderPath.getAbsolutePath());
				return;
			}
			List<FileDoc> fileDocs = new ArrayList<FileDoc>();
			Arrays.stream(fileList).forEach(pushDocsToSolr(solrClient, fileDocs));

			if (!fileDocs.isEmpty()) {
				System.out.println("Indexing from" + folderPath.getAbsolutePath() + " And Size:" + fileDocs.size());
				solrClient.addBeans(fileDocs);
				solrClient.commit();
			}
		} catch (IOException | SolrServerException e) {
			e.printStackTrace();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			e.printStackTrace();
		}

	}

	/**
	 * Builds the action applied to each entry of a folder: sub-folders are
	 * indexed recursively, files with a configured extension are added to the
	 * batch.
	 *
	 * @param solrClient client used to send documents to Solr
	 * @param fileDocs   batch that collects the documents of the current folder
	 * @return a consumer that processes one directory entry
	 */
	private Consumer<? super File> pushDocsToSolr(HttpSolrClient solrClient, List<FileDoc> fileDocs) {
		return f -> {
			if (null == f) {
				return;
			}
			if (f.isDirectory()) {
				// Sub-folders whose name contains a '.' are not descended into.
				if (!f.getName().contains(".")) {
					enqueueAndIndex(solrClient, f);
				}
			} else if (hasIndexedExtension(f.getName())) {
				// Added at most once per file, even when several configured
				// extensions overlap (".doc" / ".docx").
				fileDocs.add(new FileDoc(f.getName(), f.getAbsolutePath(), new Timestamp(f.lastModified())));
			}
		};
	}

	/**
	 * Tells whether a file name ends with one of the configured extensions,
	 * ignoring case.
	 *
	 * @param fileName simple file name (no path)
	 * @return {@code true} if the name ends with any entry of {@link #fileType}
	 */
	private boolean hasIndexedExtension(String fileName) {
		// regionMatches returns false for a negative offset, i.e. when the name
		// is shorter than the extension.
		return Arrays.stream(fileType).anyMatch(extension -> fileName.regionMatches(true,
				fileName.length() - extension.length(), extension, 0, extension.length()));
	}

	/**
	 * Creates the Solr client for {@link #baseUrl} with a 10 s connection
	 * timeout and a 60 s socket timeout.
	 *
	 * @return a new client; the caller is responsible for closing it
	 */
	private HttpSolrClient connect() {
		return new HttpSolrClient.Builder(baseUrl).withConnectionTimeout(10000).withSocketTimeout(60000).build();

	}

}



And the Maven dependency:
		<dependency>
			<groupId>org.apache.solr</groupId>
			<artifactId>solr-solrj</artifactId>
			<version>7.7.0</version>
		</dependency>

And use http://localhost:8983/solr/#/pdf-files/query to search the data.

Or http://localhost:8983/solr to open the dashboard

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s