Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for issue #116 #215

Closed
wants to merge 13 commits into from
9 changes: 9 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,15 @@
<role>Developer</role>
</roles>
</developer>
<developer>
<name>Mohamed Elsayed</name>
<url>https://github.com/MohammedElsayyed</url>
<organization>The New Library of Alexandria</organization>
<organizationUrl>http://bibalex.org/</organizationUrl>
<roles>
<role>Developer</role>
</roles>
</developer>
<developer>
<name>John Erik Halse</name>
<url>https://github.com/johnerikhalse</url>
Expand Down
1 change: 1 addition & 0 deletions src/site/xdoc/release_notes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
<li>Fixed issue <a href="https://github.com/iipc/openwayback/issues/48">#48</a> jQuery getting stomped on.</li>
<li>Support for loading resources from S3 buckets. <a href="https://github.com/iipc/openwayback/issues/189">#189</a></li>
<li>Refactored CDX Server into a war and jar module. <a href="https://github.com/iipc/openwayback/issues/164">#164</a></li>
<li>Fixed ARCRecordingProxy timeouts. <a href="https://github.com/iipc/openwayback/issues/116">#116</a></li>
</ul>
</subsection>
</section>
Expand Down
6 changes: 6 additions & 0 deletions wayback-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@
<version>2.5.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.5</version>
<type>jar</type>
</dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,17 @@
import org.archive.wayback.resourcestore.resourcefile.ResourceFactory;

/**
* This class fetches resources from the live web.
* It works with ARCRecordingProxy, not with a standard proxy server.
*
* @author brad
* @see LiveWebCache
* @see StdRemoteLiveWebCache
*
*/
public class RemoteLiveWebCache implements LiveWebCache {
public class ArcRemoteLiveWebCache implements LiveWebCache {
private static final Logger LOGGER = Logger.getLogger(
RemoteLiveWebCache.class.getName());
ArcRemoteLiveWebCache.class.getName());

protected MultiThreadedHttpConnectionManager connectionManager = null;
protected HostConfiguration hostConfiguration = null;
Expand All @@ -61,7 +66,7 @@ public class RemoteLiveWebCache implements LiveWebCache {
/**
*
*/
public RemoteLiveWebCache() {
public ArcRemoteLiveWebCache() {
connectionManager = new MultiThreadedHttpConnectionManager();
hostConfiguration = new HostConfiguration();
HttpClientParams params = new HttpClientParams();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import com.google.common.io.ByteStreams;

public class LiveRobotsNoCache extends RemoteLiveWebCache {
public class LiveRobotsNoCache extends ArcRemoteLiveWebCache {

protected int maxRobotsSize = 512000;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
/*
* Copyright 2014 Bibliotheca Alexandrina.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.archive.wayback.liveweb;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.ConnectException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.logging.Logger;
import org.apache.commons.httpclient.ConnectTimeoutException;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NoHttpResponseException;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.archive.io.arc.ARCRecord;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.LiveWebCacheUnavailableException;
import org.archive.wayback.exception.LiveWebTimeoutException;
import org.archive.wayback.exception.ResourceNotAvailableException;
import org.archive.wayback.resourcestore.resourcefile.ArcResource;
import org.archive.wayback.resourcestore.resourcefile.ResourceFactory;

/**
* This class fetches resource from live web.
* It works with standard proxy server e.g. Squid.
*
* @author Mohamed Elsayed
* @see LiveWebCache
* @see ArcRemoteLiveWebCache
*/
public class StdRemoteLiveWebCache implements LiveWebCache
{
private static final Logger LOGGER = Logger.getLogger(
StdRemoteLiveWebCache.class.getName() );

protected MultiThreadedHttpConnectionManager connectionManager;
protected HostConfiguration hostConfiguration;
protected HttpClient httpClient;
protected String requestPrefix;
private CloseableHttpResponse response;
private ArcResource ar;

/**
* StdRemoteLiveWebCache constructor initializes and configures connection objects.
*/
public StdRemoteLiveWebCache()
{
connectionManager = new MultiThreadedHttpConnectionManager();
hostConfiguration = new HostConfiguration();
HttpClientParams params = new HttpClientParams();
params.setParameter( HttpClientParams.RETRY_HANDLER,
new NoRetryHandler() );
httpClient = new HttpClient( params, connectionManager );
httpClient.setHostConfiguration( hostConfiguration );
}

/**
* Gets resource object from the live web. Configure timeout to 10 seconds.
*
* @param url to fetch from the live web.
* @param maxCacheMS maximum age of resource to return - optionally honored
* @param bUseOlder if true, return documents older than maxCacheMS if
* a more recent copy is not available.
*
* @return Resource for url
*
* @throws LiveDocumentNotAvailableException if the resource cannot be
* retrieved from the live web, but all proxying and caching
* mechanisms functioned properly
* @throws LiveWebCacheUnavailableException if there was a problem either
* accessing the live web, in proxying to the live web, or in
* maintaining the cache for the live web
* @throws LiveWebTimeoutException if there is no response from the live
* web cache before a timeout occurred.
* @throws IOException for the usual reasons
*
* @see org.archive.wayback.liveweb.LiveWebCache#getCachedResource(java.net.URL, long, boolean)
* @inheritDoc org.archive.wayback.liveweb.LiveWebCache#getCachedResource
*/
@Override
public Resource getCachedResource( URL url, long maxCacheMS,
boolean bUseOlder )
throws LiveDocumentNotAvailableException,
LiveWebCacheUnavailableException, LiveWebTimeoutException, IOException
{
String urlStr = url.toExternalForm();

if (requestPrefix != null)
urlStr = requestPrefix + urlStr;

HttpHost proxy = new HttpHost( hostConfiguration.getProxyHost(),
hostConfiguration.getProxyPort() );

// Set socketTimeout and connectionTimeout to 10 seconds.
RequestConfig reqConf = RequestConfig.custom().setProxy( proxy )
.setSocketTimeout( 10000 )
.setConnectTimeout( 10000 )
.setConnectionRequestTimeout( 10000 )
.build();
CloseableHttpClient httpclient = HttpClients.custom().
setDefaultRequestConfig(reqConf).build();
HttpGet httpGet = new HttpGet( urlStr );

try
{
// The following line gets robots.txt from live web
response= httpclient.execute( httpGet );

String httpHeaderStr = "";
String bodyStr = "";

/* If it fails to get robots.txt (http status code is 404),
then display contents and don't throw exception
(socketTimeOutException or connectTimeOutException)
*/
if ( response.getStatusLine().getStatusCode() == 404 )
{
httpHeaderStr = "HTTP/1.0 200 OK\n";
bodyStr = String.format( "%s\n%s\n",
"User-agent: *", "Allow: /" );
}
else if ( response.getStatusLine().getStatusCode() == 200 )
{
// The following line represents first line in http header
httpHeaderStr = String.format( "%s %d %s\n",
response.getStatusLine().getProtocolVersion(),
response.getStatusLine().getStatusCode(),
response.getStatusLine().getReasonPhrase() );

// Get robots.txt contents and store it into bodyStr
HttpEntity entity = response.getEntity();
bodyStr = EntityUtils.toString(entity);
}

// Get Http Header and store complete http header in httpHeaderStr
for ( Header header : response.getAllHeaders() )
httpHeaderStr += header.toString() + "\n";

httpHeaderStr += "\n";
int length = httpHeaderStr.length() + bodyStr.length();

/*
Using httpHeaderStr and bodyStr to construct responseStr.
First line in responseStr should exist.
*/

// TODO: the following line should be enhanced,
// especially the first line in responseStr.
String responseStr = String.format( "%s %s %d\n%s%s", urlStr,
"0.0.0.0 10000000000000 text/plain", length,
httpHeaderStr, bodyStr );

ByteArrayInputStream bais = new ByteArrayInputStream(
responseStr.getBytes() );

// TODO: Should not use ARCRecord
ARCRecord r = new ARCRecord( bais, "id", 0L, false, false, true );
ar = ( ArcResource ) ResourceFactory.ARCArchiveRecordToResource( r, null );

if ( ar.getStatusCode() == 502 )
{
throw new LiveDocumentNotAvailableException( urlStr );
}
else if ( ar.getStatusCode() == 504 )
{
throw new LiveWebTimeoutException( "Timeout:" + urlStr );
}

return ar;
}
catch( ResourceNotAvailableException e )
{
throw new LiveDocumentNotAvailableException( urlStr );
}
catch( NoHttpResponseException e )
{
throw new LiveWebCacheUnavailableException( "No Http Response for " +
urlStr );
}
catch( ConnectException e )
{
throw new LiveWebCacheUnavailableException( e.getLocalizedMessage() +
" : " + urlStr );
}
catch ( SocketException e )
{
throw new LiveWebCacheUnavailableException( e.getLocalizedMessage() +
" : " + urlStr );
}
catch ( SocketTimeoutException e )
{
throw new LiveWebTimeoutException( e.getLocalizedMessage() + " : " +
urlStr );
}
catch( ConnectTimeoutException e )
{
throw new LiveWebTimeoutException( e.getLocalizedMessage() + " : " +
urlStr );
}
finally
{
response.close();
}
}

/**
* Sets proxy and port (proxy:port).
*
* @param hostPort to proxy requests through - ex. "localhost:3128"
*/
public void setProxyHostPort( String hostPort )
{
int colonIdx = hostPort.indexOf( ':' );
if(colonIdx > 0)
{
String host = hostPort.substring( 0,colonIdx );
int port = Integer.valueOf( hostPort.substring( colonIdx+1 ) );
hostConfiguration.setProxy( host, port );
}
}

/**
*
* @see org.archive.wayback.liveweb.LiveWebCache#shutdown()
*/
@Override
public void shutdown()
{
throw new UnsupportedOperationException( "Not supported yet." ); //To change body of generated methods, choose Tools | Templates.
}
}
11 changes: 7 additions & 4 deletions wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,15 @@


<bean id="proxylivewebcache"
class="org.archive.wayback.liveweb.RemoteLiveWebCache">
class="org.archive.wayback.liveweb.ArcRemoteLiveWebCache">
<property name="proxyHostPort" value="localhost:8099" />
<!--
If you've set up a local squid/varnish to cache requests to the above
ARCRecordingProxy, you should use the port for that, instead of 8099:
<property name="proxyHostPort" value="localhost:3128" />
If you've set up a local squid/varnish to cache requests to the above
ARCRecordingProxy, prefer StdRemoteLiveWebCache over ArcRemoteLiveWebCache
and use that cache's port (e.g. 3128) instead of 8099:
<bean id="proxylivewebcache"
class="org.archive.wayback.liveweb.StdRemoteLiveWebCache">
<property name="proxyHostPort" value="localhost:3128" />
-->
</bean>
<bean id="excluder-factory-robot" class="org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory">
Expand Down