Skip to content

Commit

Permalink
Merge pull request #14 from p5k6/cdh5
Browse files Browse the repository at this point in the history
Upgrades to get Hiveswarm working on CDH5.
  • Loading branch information
p5k6 committed May 2, 2016
2 parents 9fe4384 + 8ccbf4c commit 8345130
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 23 deletions.
2 changes: 1 addition & 1 deletion README.markdown
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# HiveSwarm: User Defined Functions for Hive

[Hive](http://hive.apache.org/) provides a number of [useful user defined functions](http://wiki.apache.org/hadoop/Hive/LanguageManual/UDF), but there is certainly room for more. HiveSwarm provides a collection of additional useful functions.
HiveSwarm requires CDH4 running MRv1 (has not been tested with YARN)
HiveSwarm has been tested with MRv1 on CDH4 and with YARN on CDH5.

## Installation
Assuming you have Hadoop and Hive set up (along with your HADOOP_HOME and HIVE_HOME environment variables set correctly), run the following:
Expand Down
20 changes: 14 additions & 6 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
<include>org.apache.lucene:lucene-analyzers-common</include>
<include>org.apache.lucene:lucene-core</include>
<include>org.jruby:jruby-complete</include>
<include>org.apache.commons:commons-math</include>
</includes>
</artifactSet>
</configuration>
Expand Down Expand Up @@ -77,39 +78,46 @@
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.0.0-mr1-cdh4.2.1</version>
<version>2.3.0-cdh5.1.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-serde</artifactId>
<version>0.10.0-cdh4.2.1</version>
<version>0.12.0-cdh5.1.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>0.10.0-cdh4.2.1</version>
<version>0.12.0-cdh5.1.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>2.0.0-mr1-cdh4.2.1</version>
<version>2.3.0-mr1-cdh5.1.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.0.0-cdh4.2.1</version>
<version>2.3.0-cdh5.1.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-common</artifactId>
<version>0.10.0-cdh4.2.1</version>
<version>0.12.0-cdh5.1.2</version>
<scope>provided</scope>
</dependency>
<!-- Apache Commons -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math</artifactId>
<version>2.2</version>
<scope>compile</scope>
</dependency>
<!-- Apache Lucene for NLP -->
<dependency>
<groupId>org.apache.lucene</groupId>
Expand Down
42 changes: 35 additions & 7 deletions src/main/java/com/livingsocial/hive/udf/UnixLiberalTimestamp.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
package com.livingsocial.hive.udf;

import org.apache.hadoop.hive.ql.udf.UDFUnixTimeStamp;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.io.Text;

@Description(
Expand All @@ -12,10 +18,32 @@
extended = "Example:\n" +
" > SELECT a.* FROM srcpart a WHERE _FUNC_ (a.hr) < unix_timestamp() LIMIT 1;\n"
)
public class UnixLiberalTimestamp extends UDFUnixTimeStamp {
public LongWritable evaluate(Text datestring) {
if(datestring != null && datestring.find(" ") == -1)
datestring = new Text(datestring.toString() + " 00:00:00");
return super.evaluate(datestring);
public class UnixLiberalTimestamp extends GenericUDFUnixTimeStamp {

private transient StringObjectInspector stringOI;

public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
if (arguments.length != 1) {
throw new UDFArgumentLengthException(getName().toUpperCase() + " only takes 1 arguments: String");
}
super.initialize(new ObjectInspector[]{arguments[0],
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING)});

stringOI = (StringObjectInspector) arguments[0];
return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
}

public Object evaluate(DeferredObject[] arguments) throws HiveException {
String datestring = stringOI.getPrimitiveJavaObject(arguments[0].get());
if (datestring == null) return null;
if (datestring.length() == 19) // timestamp
return super.evaluate(new DeferredObject[]{arguments[0],
new DeferredJavaObject("yyyy-MM-dd HH:mm:ss")});
else if (datestring.length() > 19) // timestamp with milliseconds
return super.evaluate(new DeferredObject[]{arguments[0],
new DeferredJavaObject("yyyy-MM-dd HH:mm:ss.S")});
else // date
return super.evaluate(new DeferredObject[]{arguments[0],
new DeferredJavaObject("yyyy-MM-dd")});
}
}
18 changes: 9 additions & 9 deletions src/main/java/com/livingsocial/hive/udf/ZTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,33 +19,33 @@
" Alternate format: p_value(critical_value). This skips the rest and just does a t-dist lookup"
)
public class ZTest extends UDF {

private static NormalDistribution distribution = new NormalDistributionImpl();

/**
 * Two-tailed p-value for a (non-negative) z statistic, looked up against the
 * standard normal distribution held in the class-level {@code distribution}.
 * NOTE(review): callers are expected to pass |z|; a negative input would
 * yield a value above 1 — confirm against the evaluate() entry points.
 */
public double pval(double val){
    final double lowerTailArea;
    try {
        lowerTailArea = distribution.cumulativeProbability(val);
    } catch (MathException e) {
        // commons-math signals numerical failure with a checked exception;
        // surface it unchecked, preserving the cause.
        throw new RuntimeException(e);
    }
    return 2 * (1 - lowerTailArea);
}

/**
 * z statistic for a two-sample comparison of means:
 * |treatment mean - control mean| divided by the combined standard error
 * sqrt(s_t^2/n_t + s_c^2/n_c).
 */
private double criticalValue(double controlAvg, double controlStddev, long controlSize,
                             double treatmentAvg, double treatmentStddev, long treatmentSize) {
    final double treatmentVariancePerObs = (treatmentStddev * treatmentStddev) / treatmentSize;
    final double controlVariancePerObs = (controlStddev * controlStddev) / controlSize;
    final double standardError = Math.sqrt(treatmentVariancePerObs + controlVariancePerObs);
    return Math.abs(treatmentAvg - controlAvg) / standardError;
}

private double pval(final double controlAvg, final double controlStddev, final long controlSize,
final double treatmentAvg, final double treatmentStddev, final long treatmentSize) {

double critValue = criticalValue(controlAvg, controlStddev, controlSize,
double critValue = criticalValue(controlAvg, controlStddev, controlSize,
treatmentAvg, treatmentStddev, treatmentSize);
return pval(critValue);
}


public DoubleWritable evaluate(final DoubleWritable criticalValue) {
if (criticalValue == null) return null;
double val = criticalValue.get();
Expand All @@ -64,8 +64,8 @@ public DoubleWritable evaluate(final DoubleWritable controlAvg, final DoubleWrit
return null;
}

return new DoubleWritable(pval(controlAvg.get(), controlStddev.get(), controlSize.get(),
return new DoubleWritable(pval(controlAvg.get(), controlStddev.get(), controlSize.get(),
treatmentAvg.get(), treatmentStddev.get(), treatmentSize.get()));
}

}

0 comments on commit 8345130

Please sign in to comment.