Technology hub‎ > ‎Search and Crawl‎ > ‎

Java changes for Nutch cookie-based authentication: Step 2



Step 2: Java modifications to Httpclient plugin (protocol-httpclient)


Modify the class org.apache.nutch.protocol.httpclient.Http (Http.java) to add the methods and variables shown below.

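Modify setConf to read the new configuration properties (http.auth.cookie.policy, http.auth.csv.urls and http.auth.csv.cookienames), split the comma-separated values, and store the results in the new variables.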

// JDK imports

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

// Commons Logging imports

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// HTTP Client imports

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NTCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;

// Nutch imports

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.NutchConfiguration;


    /**
     * Reads the configuration from the Nutch configuration files and sets the
     * configuration.
     *
     * @param conf
     *            Configuration
     */
    public void setConf(Configuration conf) {
        super.setConf(conf);
        Http.conf = conf;
        this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
        this.proxyUsername = conf.get("http.proxy.username", "");
        this.proxyPassword = conf.get("http.proxy.password", "");
        this.proxyRealm = conf.get("http.proxy.realm", "");
        agentHost = conf.get("http.agent.host", "");
        authFile = conf.get("http.auth.file", "");
        cookiePolicy = conf.get("http.auth.cookie.policy", "");
       
        if(cookiePolicy == null || cookiePolicy.equals("")) {
            cookiePolicy = CookiePolicy.DEFAULT;
        }
       
        String csvAuthUrls = conf.get("http.auth.csv.urls", "");
        String cookieNames = conf.get("http.auth.csv.cookienames", "");
        if(null != cookieNames && !cookieNames.equals("")) {
            authSessionCookieNames = Arrays.asList(cookieNames.split(","));
        }
       
        if(null != csvAuthUrls && !csvAuthUrls.equals("")) {
            authURLs = Arrays.asList(csvAuthUrls.split(","));
        }
       
       
        configureClient();
        try {
            setCredentials();
        } catch (Exception ex) {
            if (LOG.isErrorEnabled()) {
                LOG.error("Could not read " + authFile + " : "
                        + ex.getMessage());
            }
        }
    }


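Override getResponse() so that, before fetching the requested URL, it loops through the configured authentication URLs (instead of relying on whichever URL happens to be fetched first to trigger NTLM/Basic/other HTTP authentication). resolveCredentials is called for each authentication URL so that the credentials specific to that URL are used. Authentication is repeated whenever the expected session cookies do not exist.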


    /**
     * Fetches the <code>url</code> with a configured HTTP client and gets the
     * response.
     *
     * @param url
     *            URL to be fetched
     * @param datum
     *            Crawl data
     * @param redirect
     *            Follow redirects if and only if true
     * @return HTTP response
     */
    protected Response getResponse(URL url, WebPage page, boolean redirect)
            throws ProtocolException, IOException {
       
        for(int i=0; i < authURLs.size(); i++) {

            String authURL = authURLs.get(i);
           
            if(authURL != null && !authCookiesExist()) {
               
                if (LOG.isInfoEnabled()) {
                    LOG.info("Hitting Auth URL " + authURL);
                }
               
                URL urlAuthUrl = new URL(authURL);
                resolveCredentials(urlAuthUrl);
                new HttpResponse(this, urlAuthUrl, page, redirect, cookiePolicy);
            }
        }
       
        resolveCredentials(url);
        HttpResponse httpResponse = new HttpResponse(this, url, page, redirect, cookiePolicy);
       
        return httpResponse;
    }


Utility method that checks whether the authentication cookies exist. If they do not, getResponse() authenticates, either for the first time or once again.

    private boolean authCookiesExist() {
       
        Cookie[] cookies = client.getState().getCookies();
       
        if(cookies == null)  return false;
       
        List<String> currentCookieNames = new ArrayList<String>();
       
        for(int i = 0; i < cookies.length; i++) {
            currentCookieNames.add(cookies[i].getName());
        }
       
        return currentCookieNames.containsAll(authSessionCookieNames);
    }



Comments