diff --git a/pom.xml b/pom.xml index 280c453291..6857930f81 100644 --- a/pom.xml +++ b/pom.xml @@ -355,6 +355,15 @@ Developer + + Mohamed Elsayed + https://github.com/MohammedElsayyed + The New Library of Alexandria + http://bibalex.org/ + + Developer + + John Erik Halse https://github.com/johnerikhalse diff --git a/src/site/xdoc/release_notes.xml b/src/site/xdoc/release_notes.xml index 1f090123c0..9f424c007d 100644 --- a/src/site/xdoc/release_notes.xml +++ b/src/site/xdoc/release_notes.xml @@ -53,6 +53,7 @@
  • Fixed issue #48 jQuery getting stomped on.
  • Support for loading resources from S3 buckets. #189
  • Refactored CDX Server into a war and jar module. #164
  • +
  • Fixed ARCRecordingProxy timeouts. #116
  • diff --git a/wayback-core/pom.xml b/wayback-core/pom.xml index e85cc16df5..aa20db4ca6 100644 --- a/wayback-core/pom.xml +++ b/wayback-core/pom.xml @@ -125,6 +125,12 @@ 2.5.1 test + + org.apache.httpcomponents + httpclient + 4.3.5 + jar + diff --git a/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java b/wayback-core/src/main/java/org/archive/wayback/liveweb/ArcRemoteLiveWebCache.java similarity index 96% rename from wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java rename to wayback-core/src/main/java/org/archive/wayback/liveweb/ArcRemoteLiveWebCache.java index 629572c697..fd3fd9414d 100644 --- a/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java +++ b/wayback-core/src/main/java/org/archive/wayback/liveweb/ArcRemoteLiveWebCache.java @@ -46,12 +46,17 @@ import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; /** + * This class fetches resource from live web. + * It works with ArcRecordingProxy not standard proxy server + * * @author brad + * @see LiveWebCache + * @see StdRemoteLiveWebCache * */ -public class RemoteLiveWebCache implements LiveWebCache { +public class ArcRemoteLiveWebCache implements LiveWebCache { private static final Logger LOGGER = Logger.getLogger( - RemoteLiveWebCache.class.getName()); + ArcRemoteLiveWebCache.class.getName()); protected MultiThreadedHttpConnectionManager connectionManager = null; protected HostConfiguration hostConfiguration = null; @@ -61,7 +66,7 @@ public class RemoteLiveWebCache implements LiveWebCache { /** * */ - public RemoteLiveWebCache() { + public ArcRemoteLiveWebCache() { connectionManager = new MultiThreadedHttpConnectionManager(); hostConfiguration = new HostConfiguration(); HttpClientParams params = new HttpClientParams(); diff --git a/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveRobotsNoCache.java b/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveRobotsNoCache.java index 
dd7d54a247..389823ea89 100644 --- a/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveRobotsNoCache.java +++ b/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveRobotsNoCache.java @@ -16,7 +16,7 @@ import com.google.common.io.ByteStreams; -public class LiveRobotsNoCache extends RemoteLiveWebCache { +public class LiveRobotsNoCache extends ArcRemoteLiveWebCache { protected int maxRobotsSize = 512000; diff --git a/wayback-core/src/main/java/org/archive/wayback/liveweb/StdRemoteLiveWebCache.java b/wayback-core/src/main/java/org/archive/wayback/liveweb/StdRemoteLiveWebCache.java new file mode 100644 index 0000000000..645e7ebecb --- /dev/null +++ b/wayback-core/src/main/java/org/archive/wayback/liveweb/StdRemoteLiveWebCache.java @@ -0,0 +1,258 @@ +/* + * Copyright 2014 Bibliotheca Alexandrina. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.wayback.liveweb; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.net.ConnectException; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.net.URL; +import java.util.logging.Logger; +import org.apache.commons.httpclient.ConnectTimeoutException; +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; +import org.apache.commons.httpclient.NoHttpResponseException; +import org.apache.commons.httpclient.params.HttpClientParams; +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import org.apache.http.HttpHost; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.LiveDocumentNotAvailableException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; +import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.resourcefile.ArcResource; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; + +/** + * This class fetches resource from live web. + * It works with standard proxy server e.g. Squid. 
+ * + * @author Mohamed Elsayed + * @see LiveWebCache + * @see ArcRemoteLiveWebCache + */ +public class StdRemoteLiveWebCache implements LiveWebCache +{ + private static final Logger LOGGER = Logger.getLogger( + StdRemoteLiveWebCache.class.getName() ); + + protected MultiThreadedHttpConnectionManager connectionManager; + protected HostConfiguration hostConfiguration; + protected HttpClient httpClient; + protected String requestPrefix; + private CloseableHttpResponse response; + private ArcResource ar; + + /** + * StdRemoteLiveWebCache constructor initializes and configures connection objects. + */ + public StdRemoteLiveWebCache() + { + connectionManager = new MultiThreadedHttpConnectionManager(); + hostConfiguration = new HostConfiguration(); + HttpClientParams params = new HttpClientParams(); + params.setParameter( HttpClientParams.RETRY_HANDLER, + new NoRetryHandler() ); + httpClient = new HttpClient( params, connectionManager ); + httpClient.setHostConfiguration( hostConfiguration ); + } + + /** + * Gets resource object from the live web. Configure timeout to 10 seconds. + * + * @param url to fetch from the live web. + * @param maxCacheMS maximum age of resource to return - optionally honored + * @param bUseOlder if true, return documents older than maxCacheMS if + * a more recent copy is not available. + * + * @return Resource for url + * + * @throws LiveDocumentNotAvailableException if the resource cannot be + * retrieved from the live web, but all proxying and caching + * mechanisms functioned properly + * @throws LiveWebCacheUnavailableException if there was a problem either + * accessing the live web, in proxying to the live web, or in + * maintaining the cache for the live web + * @throws LiveWebTimeoutException if there is no response from the live + * web cache before a timeout occurred. 
+ * @throws IOException for the usual reasons + * + * @see org.archive.wayback.liveweb.LiveWebCache#getCachedResource(java.net.URL, long, boolean) + * @inheritDoc org.archive.wayback.liveweb.LiveWebCache#getCachedResource + */ + @Override + public Resource getCachedResource( URL url, long maxCacheMS, + boolean bUseOlder ) + throws LiveDocumentNotAvailableException, + LiveWebCacheUnavailableException, LiveWebTimeoutException, IOException + { + String urlStr = url.toExternalForm(); + + if (requestPrefix != null) + urlStr = requestPrefix + urlStr; + + HttpHost proxy = new HttpHost( hostConfiguration.getProxyHost(), + hostConfiguration.getProxyPort() ); + + // Set socketTimeout and connectionTimeout to 10 seconds. + RequestConfig reqConf = RequestConfig.custom().setProxy( proxy ) + .setSocketTimeout( 10000 ) + .setConnectTimeout( 10000 ) + .setConnectionRequestTimeout( 10000 ) + .build(); + CloseableHttpClient httpclient = HttpClients.custom(). + setDefaultRequestConfig(reqConf).build(); + HttpGet httpGet = new HttpGet( urlStr ); + + try + { + // The following line gets robots.txt from live web + response= httpclient.execute( httpGet ); + + String httpHeaderStr = ""; + String bodyStr = ""; + + /* If it fails to get robots.txt (http status code is 404), + then display contents and don't throw exception + (socketTimeOutException or connectTimeOutException) + */ + if ( response.getStatusLine().getStatusCode() == 404 ) + { + httpHeaderStr = "HTTP/1.0 200 OK\n"; + bodyStr = String.format( "%s\n%s\n", + "User-agent: *", "Allow: /" ); + } + else if ( response.getStatusLine().getStatusCode() == 200 ) + { + // The following line represents first line in http header + httpHeaderStr = String.format( "%s %d %s\n", + response.getStatusLine().getProtocolVersion(), + response.getStatusLine().getStatusCode(), + response.getStatusLine().getReasonPhrase() ); + + // Get robots.txt contents and store it into bodyStr + HttpEntity entity = response.getEntity(); + bodyStr = 
EntityUtils.toString(entity); + } + + // Get Http Header and store complete http header in httpHeaderStr + for ( Header header : response.getAllHeaders() ) + httpHeaderStr += header.toString() + "\n"; + + httpHeaderStr += "\n"; + int length = httpHeaderStr.length() + bodyStr.length(); + + /* + Using httpHeaderStr and bodyStr to construct responseStr. + First line in responseStr should exist. + */ + + // TODO: the following line should be enhanced, + // especially the first line in responseStr. + String responseStr = String.format( "%s %s %d\n%s%s", urlStr, + "0.0.0.0 10000000000000 text/plain", length, + httpHeaderStr, bodyStr ); + + ByteArrayInputStream bais = new ByteArrayInputStream( + responseStr.getBytes() ); + + // TODO: Should not use ARCRecord + ARCRecord r = new ARCRecord( bais, "id", 0L, false, false, true ); + ar = ( ArcResource ) ResourceFactory.ARCArchiveRecordToResource( r, null ); + + if ( ar.getStatusCode() == 502 ) + { + throw new LiveDocumentNotAvailableException( urlStr ); + } + else if ( ar.getStatusCode() == 504 ) + { + throw new LiveWebTimeoutException( "Timeout:" + urlStr ); + } + + return ar; + } + catch( ResourceNotAvailableException e ) + { + throw new LiveDocumentNotAvailableException( urlStr ); + } + catch( NoHttpResponseException e ) + { + throw new LiveWebCacheUnavailableException( "No Http Response for " + + urlStr ); + } + catch( ConnectException e ) + { + throw new LiveWebCacheUnavailableException( e.getLocalizedMessage() + + " : " + urlStr ); + } + catch ( SocketException e ) + { + throw new LiveWebCacheUnavailableException( e.getLocalizedMessage() + + " : " + urlStr ); + } + catch ( SocketTimeoutException e ) + { + throw new LiveWebTimeoutException( e.getLocalizedMessage() + " : " + + urlStr ); + } + catch( ConnectTimeoutException e ) + { + throw new LiveWebTimeoutException( e.getLocalizedMessage() + " : " + + urlStr ); + } + finally + { + response.close(); + } + } + + /** + * Sets proxy and port (proxy:port). 
+ * + * @param hostPort to proxy requests through - ex. "localhost:3128" + */ + public void setProxyHostPort( String hostPort ) + { + int colonIdx = hostPort.indexOf( ':' ); + if(colonIdx > 0) + { + String host = hostPort.substring( 0,colonIdx ); + int port = Integer.valueOf( hostPort.substring( colonIdx+1 ) ); + hostConfiguration.setProxy( host, port ); + } + } + + /** + * + * @see org.archive.wayback.liveweb.LiveWebCache#shutdown() + */ + @Override + public void shutdown() + { + throw new UnsupportedOperationException( "Not supported yet." ); //To change body of generated methods, choose Tools | Templates. + } +} \ No newline at end of file diff --git a/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml b/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml index cec9b1a696..ea89f6c30b 100644 --- a/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml +++ b/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml @@ -30,12 +30,15 @@ + class="org.archive.wayback.liveweb.ArcRemoteLiveWebCache">