Java changes nutch cookie based authentication Step 2
<< Step 1 Back to Configuration
Step 2: Java modifications to Httpclient plugin (protocol-httpclient)
Modify the class org.apache.nutch.protocol.httpclient.Http (Http.java) to add the methods and variables below.
Modify setConf to read the new configuration properties, split their comma-separated values, and store the results in the new variables.
// JDK imports
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// HTTP Client imports
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NTCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.NutchConfiguration;
/**
 * Reads the configuration from the Nutch configuration files and sets the
 * configuration.
 * <p>
 * In addition to the thread, proxy and agent settings, this reads the
 * cookie based authentication properties:
 * <ul>
 * <li><code>http.auth.cookie.policy</code> - cookie policy name; falls back
 * to <code>CookiePolicy.DEFAULT</code> when unset or blank</li>
 * <li><code>http.auth.csv.urls</code> - comma separated authentication
 * URLs</li>
 * <li><code>http.auth.csv.cookienames</code> - comma separated names of the
 * cookies that mark an authenticated session</li>
 * </ul>
 * Entries of the comma separated properties are trimmed and empty entries
 * are dropped. Both lists are always assigned (possibly empty), so they are
 * never left <code>null</code> or stale after this method has run.
 *
 * @param conf
 *          Configuration
 */
public void setConf(Configuration conf) {
  super.setConf(conf);
  Http.conf = conf;
  this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
  this.proxyUsername = conf.get("http.proxy.username", "");
  this.proxyPassword = conf.get("http.proxy.password", "");
  this.proxyRealm = conf.get("http.proxy.realm", "");
  agentHost = conf.get("http.agent.host", "");
  authFile = conf.get("http.auth.file", "");

  String configuredPolicy = conf.get("http.auth.cookie.policy", "");
  if (configuredPolicy == null || configuredPolicy.trim().length() == 0) {
    cookiePolicy = CookiePolicy.DEFAULT;
  } else {
    cookiePolicy = configuredPolicy.trim();
  }

  // Trimming matters: " JSESSIONID" would never match a real cookie name
  // and would force a re-authentication on every fetch.
  authSessionCookieNames = splitCsv(conf.get("http.auth.csv.cookienames", ""));
  authURLs = splitCsv(conf.get("http.auth.csv.urls", ""));

  configureClient();
  try {
    setCredentials();
  } catch (Exception ex) {
    // Best effort: a broken auth file must not prevent the plugin from
    // being configured, but keep the cause in the log.
    if (LOG.isErrorEnabled()) {
      LOG.error("Could not read " + authFile + " : "
          + ex.getMessage(), ex);
    }
  }
}

/**
 * Splits a comma separated configuration value into its trimmed, non-empty
 * entries.
 *
 * @param csv
 *          raw property value, may be <code>null</code> or empty
 * @return mutable list of entries; empty when nothing is configured
 */
private static List<String> splitCsv(String csv) {
  List<String> entries = new ArrayList<String>();
  if (csv == null) {
    return entries;
  }
  for (String piece : csv.split(",")) {
    String entry = piece.trim();
    if (entry.length() > 0) {
      entries.add(entry);
    }
  }
  return entries;
}
Override getResponse() to loop through the configured authentication URLs (instead of relying on whichever URL happens to be fetched first to perform NTLM/Basic/other HTTP authentication). Use resolveCredentials to authenticate against each specific URL. Authentication is repeated whenever the authentication cookies do not exist.
/**
 * Fetches the <code>url</code> with a configured HTTP client and gets the
 * response.
 * <p>
 * Before the actual fetch, the configured authentication URLs are requested
 * one after another until <code>authCookiesExist()</code> reports that the
 * session cookies are present. The responses of these authentication
 * requests are discarded; they are issued only for their effect on the
 * client's state.
 *
 * @param url
 *          URL to be fetched
 * @param page
 *          Web page handed to every <code>HttpResponse</code> created here
 * @param redirect
 *          Follow redirects if and only if true
 * @return HTTP response
 * @throws ProtocolException
 *           propagated from the underlying fetch
 * @throws IOException
 *           propagated from the underlying fetch, or raised when a
 *           configured authentication URL is malformed
 */
protected Response getResponse(URL url, WebPage page, boolean redirect)
    throws ProtocolException, IOException {
  // authURLs may be unset when no authentication URLs are configured.
  if (authURLs != null) {
    for (String authURL : authURLs) {
      // Skip holes left by stray commas instead of failing the real fetch
      // with a MalformedURLException.
      if (authURL == null || authURL.trim().length() == 0) {
        continue;
      }
      // Once the session cookies are there, no further login is needed.
      if (authCookiesExist()) {
        break;
      }
      if (LOG.isInfoEnabled()) {
        LOG.info("Hitting Auth URL " + authURL);
      }
      URL urlAuthUrl = new URL(authURL.trim());
      resolveCredentials(urlAuthUrl);
      // Response intentionally unused: the request is made for its effect
      // on the HTTP client's state.
      new HttpResponse(this, urlAuthUrl, page, redirect, cookiePolicy);
    }
  }
  resolveCredentials(url);
  return new HttpResponse(this, url, page, redirect, cookiePolicy);
}
Utility method to check whether the authentication cookies exist (if they do not, authenticate for the first time or once again).
/**
 * Checks whether every configured authentication session cookie is present
 * in the HTTP client's current state.
 * <p>
 * Only cookie names are compared; values are not inspected.
 * NOTE(review): expiry is not checked here either - confirm whether expired
 * cookies can still be returned by the client state.
 *
 * @return <code>false</code> when the client reports no cookie array;
 *         <code>true</code> when no cookie names are configured (nothing
 *         can be missing) or when all configured names are found;
 *         <code>false</code> otherwise
 */
private boolean authCookiesExist() {
  Cookie[] cookies = client.getState().getCookies();
  if (cookies == null) {
    return false;
  }
  // An unset list is treated like an empty one, for which containsAll()
  // is trivially true, instead of failing with a NullPointerException.
  if (authSessionCookieNames == null || authSessionCookieNames.isEmpty()) {
    return true;
  }
  Collection<String> presentNames = new HashSet<String>();
  for (Cookie cookie : cookies) {
    presentNames.add(cookie.getName());
  }
  return presentNames.containsAll(authSessionCookieNames);
}