Java changes for Nutch cookie-based authentication: Step 2

<< Step 1 Back to Configuration

Step 2: Java modifications to Httpclient plugin (protocol-httpclient)

Modify the class org.apache.nutch.protocol.httpclient.Http (Http.java) to add the methods and variables shown below.

Modify setConf to read the new configuration properties, split the comma-separated values, and store them in the new variables.

// JDK imports
import java.io.InputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;

// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// HTTP Client imports
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NTCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;

// Nutch imports
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

/**
 * Reads the configuration from the Nutch configuration files and sets the
 * configuration.
 * <p>
 * In addition to the thread, proxy, agent-host and auth-file settings, this
 * reads:
 * <ul>
 * <li><code>http.auth.cookie.policy</code> - the cookie policy; when unset
 * or empty, <code>CookiePolicy.DEFAULT</code> is used,</li>
 * <li><code>http.auth.csv.urls</code> - comma separated authentication
 * URLs,</li>
 * <li><code>http.auth.csv.cookienames</code> - comma separated names of the
 * authentication session cookies.</li>
 * </ul>
 * Both lists are rebuilt on every call: entries are trimmed, blank entries
 * are dropped, and a missing or empty property yields an empty list rather
 * than leaving the previous value (or no value) in place.
 *
 * @param conf
 *          Configuration
 */
public void setConf(Configuration conf) {
  super.setConf(conf);
  Http.conf = conf;

  this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
  this.proxyUsername = conf.get("http.proxy.username", "");
  this.proxyPassword = conf.get("http.proxy.password", "");
  this.proxyRealm = conf.get("http.proxy.realm", "");
  agentHost = conf.get("http.agent.host", "");
  authFile = conf.get("http.auth.file", "");

  cookiePolicy = conf.get("http.auth.cookie.policy", "");
  if (cookiePolicy == null || cookiePolicy.equals("")) {
    cookiePolicy = CookiePolicy.DEFAULT;
  }

  // Always assign both lists so that later code never sees a stale or
  // unset value when a property is missing from the configuration.
  authSessionCookieNames = splitCsv(conf.get("http.auth.csv.cookienames", ""));
  authURLs = splitCsv(conf.get("http.auth.csv.urls", ""));

  configureClient();

  try {
    setCredentials();
  } catch (Exception ex) {
    if (LOG.isErrorEnabled()) {
      // Pass the exception itself so the stack trace is not lost.
      LOG.error("Could not read " + authFile + " : "
          + ex.getMessage(), ex);
    }
  }
}

/**
 * Splits a comma separated value into its trimmed, non-empty entries.
 *
 * @param csv
 *          comma separated string, may be <code>null</code> or empty
 * @return list of entries in their original order; empty (never
 *         <code>null</code>) when there are none
 */
private static List<String> splitCsv(String csv) {
  List<String> values = new ArrayList<String>();
  if (csv == null) {
    return values;
  }
  for (String token : csv.split(",")) {
    String value = token.trim();
    // "a, ,b" or a trailing comma must not produce blank entries.
    if (value.length() > 0) {
      values.add(value);
    }
  }
  return values;
}

Override getResponse() to add a loop that goes through the configured authentication URLs (instead of using whichever URL is fetched first to perform NTLM/Basic/other HTTP authentication). resolveCredentials is used to authenticate against each specific URL. Authentication is performed again whenever the auth cookies do not exist.

/**
 * Fetches the <code>url</code> with a configured HTTP client and gets the
 * response.
 * <p>
 * Before the actual fetch, every configured authentication URL is requested
 * (with credentials resolved for that URL) for as long as
 * <code>authCookiesExist()</code> reports that the auth cookies are missing.
 * The responses of those requests are discarded; presumably they are issued
 * only for their effect on the client's cookie state.
 *
 * @param url
 *          URL to be fetched
 * @param page
 *          Web page passed through to the <code>HttpResponse</code>
 * @param redirect
 *          Follow redirects if and only if true
 * @return HTTP response
 * @throws ProtocolException
 *           declared for the underlying fetch
 * @throws IOException
 *           if an authentication URL is malformed or an I/O error occurs
 */
protected Response getResponse(URL url, WebPage page, boolean redirect)
    throws ProtocolException, IOException {
  // authURLs may be unset when no authentication URLs were configured.
  if (authURLs != null) {
    for (String authURL : authURLs) {
      // Re-check the cookies on every iteration: an earlier auth URL may
      // already have supplied them, making the remaining requests needless.
      if (authURL != null && !authCookiesExist()) {
        if (LOG.isInfoEnabled()) {
          LOG.info("Hitting Auth URL " + authURL);
        }
        URL urlAuthUrl = new URL(authURL);
        resolveCredentials(urlAuthUrl);
        // Response intentionally unused; only the request itself matters.
        new HttpResponse(this, urlAuthUrl, page, redirect, cookiePolicy);
      }
    }
  }

  resolveCredentials(url);
  return new HttpResponse(this, url, page, redirect, cookiePolicy);
}

Utility method to check whether the auth cookies exist (if they do not, authentication is performed, either for the first time or once again).

/**
 * Tells whether the HTTP client state currently holds a cookie for every
 * name listed in <code>authSessionCookieNames</code>.
 *
 * @return <code>false</code> if the client state returns no cookie array;
 *         otherwise <code>true</code> exactly when all configured cookie
 *         names are present among the current cookies
 */
private boolean authCookiesExist() {
  Cookie[] held = client.getState().getCookies();
  if (held == null) {
    return false;
  }

  // Collect the names of the cookies the client holds right now.
  Collection<String> heldNames = new HashSet<String>();
  for (Cookie cookie : held) {
    heldNames.add(cookie.getName());
  }

  return heldNames.containsAll(authSessionCookieNames);
}

<< Step 1 Back to Configuration