diff --git a/Cats.html b/Cats.html index c9283316..2a88e5bd 100644 --- a/Cats.html +++ b/Cats.html @@ -166,7 +166,7 @@

count <- typedDs.count[Action]() } yield (sample, count) // result: Action[(Seq[(Int, String)], Long)] = Kleisli( -// cats.data.Kleisli$$Lambda$11673/0x0000000803398040@529321b4 +// cats.data.Kleisli$$Lambda$11614/0x000000080330d840@157d8902 // )

As with Job, note that nothing has been run yet. The effect has been properly suspended. To run our program, we must first supply the SparkSession to the ReaderT layer and then @@ -193,7 +193,7 @@

yield r // resultWithDescription: Action[(Seq[(Int, String)], Long)] = Kleisli( -// cats.data.Kleisli$$$Lambda$13094/0x0000000803885040@63b314e7 +// cats.data.Kleisli$$$Lambda$13124/0x00000008038d2840@2fa15da1 // ) resultWithDescription.run(spark).unsafeRunSync() diff --git a/FeatureOverview.html b/FeatureOverview.html index fa30ecab..2dce7930 100644 --- a/FeatureOverview.html +++ b/FeatureOverview.html @@ -716,7 +716,7 @@

// priceModifier: (String, Double) => Double = <function2> val udf = aptTypedDs.makeUDF(priceModifier) -// udf: (frameless.TypedColumn[Apartment, String], frameless.TypedColumn[Apartment, Double]) => frameless.TypedColumn[Apartment, Double] = frameless.functions.Udf$$Lambda$14191/0x0000000803ce1840@7c973009 +// udf: (frameless.TypedColumn[Apartment, String], frameless.TypedColumn[Apartment, Double]) => frameless.TypedColumn[Apartment, Double] = frameless.functions.Udf$$Lambda$14170/0x0000000803caf040@200c7081 val aptds = aptTypedDs // For shorter expressions // aptds: TypedDataset[Apartment] = [city: string, surface: int ... 2 more fields] diff --git a/Injection.html b/Injection.html index dcaf4c97..23805139 100644 --- a/Injection.html +++ b/Injection.html @@ -144,7 +144,7 @@

// people: Seq[Person] = List( // Person( // 42, -// java.util.GregorianCalendar[time=1718499806448,areFieldsSet=true,areAllFieldsSet=true,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Etc/UTC",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=1,YEAR=2024,MONTH=5,WEEK_OF_YEAR=25,WEEK_OF_MONTH=4,DAY_OF_MONTH=16,DAY_OF_YEAR=168,DAY_OF_WEEK=1,DAY_OF_WEEK_IN_MONTH=3,AM_PM=0,HOUR=1,HOUR_OF_DAY=1,MINUTE=3,SECOND=26,MILLISECOND=448,ZONE_OFFSET=0,DST_OFFSET=0] +// java.util.GregorianCalendar[time=1719924507663,areFieldsSet=true,areAllFieldsSet=true,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Etc/UTC",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=1,YEAR=2024,MONTH=6,WEEK_OF_YEAR=27,WEEK_OF_MONTH=1,DAY_OF_MONTH=2,DAY_OF_YEAR=184,DAY_OF_WEEK=3,DAY_OF_WEEK_IN_MONTH=1,AM_PM=1,HOUR=0,HOUR_OF_DAY=12,MINUTE=48,SECOND=27,MILLISECOND=663,ZONE_OFFSET=0,DST_OFFSET=0] // ) // )

And an instance of a TypedDataset:

@@ -167,7 +167,7 @@

cal } } -// calendarToLongInjection: AnyRef with Injection[Calendar, Long] = repl.MdocSession$MdocApp0$$anon$1@10f4a4d6 +// calendarToLongInjection: AnyRef with Injection[Calendar, Long] = repl.MdocSession$MdocApp0$$anon$1@4ba9ca16

We can be less verbose using the Injection.apply function:

import frameless._
 
@@ -180,7 +180,7 @@ 

cal.setTime(new java.util.Date(l)) cal }) -// calendarToLongInjection: Injection[Calendar, Long] = frameless.Injection$$anon$1@5ad19282

+// calendarToLongInjection: Injection[Calendar, Long] = frameless.Injection$$anon$1@4bc98c72

Now we can create our TypedDataset:

val personDS = TypedDataset.create(people)
 // personDS: TypedDataset[Person] = [age: int, birthday: bigint]
@@ -214,7 +214,7 @@

case 2 => Female case 3 => Other }) -// genderToInt: Injection[Gender, Int] = frameless.Injection$$anon$1@583e4aa +// genderToInt: Injection[Gender, Int] = frameless.Injection$$anon$1@40b834de

And now we can create our TypedDataset:

val personDS = TypedDataset.create(people)
 // personDS: TypedDataset[Person] = [age: int, gender: int]
diff --git a/Job.html b/Job.html index 465c6f69..a36ecc7a 100644 --- a/Job.html +++ b/Job.html @@ -156,7 +156,7 @@

Job[A]

count <- ds.count() sample <- ds.take((count/5).toInt) } yield sample -// countAndTakeJob: frameless.Job[Seq[Int]] = frameless.Job$$anon$3@5a507fe4 +// countAndTakeJob: frameless.Job[Seq[Int]] = frameless.Job$$anon$3@1bbad6ea countAndTakeJob.run() // res1: Seq[Int] = WrappedArray(1, 2, 3, 4) @@ -167,7 +167,7 @@

Job[A]

def computeMinOfSample(sample: Job[Seq[Int]]): Job[Int] = sample.map(_.min) val finalJob = computeMinOfSample(countAndTakeJob) -// finalJob: Job[Int] = frameless.Job$$anon$2@6ecc4c82 +// finalJob: Job[Int] = frameless.Job$$anon$2@42a483cd

Now we can execute this new job by specifying a group-id and a description. This allows the programmer to see this information on the Spark UI and help track, say, performance issues.

diff --git a/TypedDatasetVsSparkDataset.html b/TypedDatasetVsSparkDataset.html index bbfc8428..fa947c75 100644 --- a/TypedDatasetVsSparkDataset.html +++ b/TypedDatasetVsSparkDataset.html @@ -159,9 +159,9 @@

Comparing T // +---+---+ // | i| j| // +---+---+ +// | 1| Q| // | 10| W| // |100| E| -// | 1| Q| // +---+---+ //

The value ds holds the content of the initialDs read from a parquet file. diff --git a/TypedEncoder.html b/TypedEncoder.html index c62ac83a..bfce99c1 100644 --- a/TypedEncoder.html +++ b/TypedEncoder.html @@ -212,7 +212,7 @@

Typed Encoders in Frameless// ds: TypedDataset[Foo] = [i: int, b: struct<d: double, s: string>] ds.collect() -// res3: frameless.Job[Seq[Foo]] = frameless.Job$$anon$4@33820cc8 +// res3: frameless.Job[Seq[Foo]] = frameless.Job$$anon$4@572adb

But any non-encodable in the case class hierarchy will be detected at compile time:

case class BarDate(d: Double, s: String, t: java.util.Calendar)
 case class FooDate(i: Int, b: BarDate)
diff --git a/TypedML.html b/TypedML.html index 15059060..23058784 100644 --- a/TypedML.html +++ b/TypedML.html @@ -176,7 +176,7 @@

case class Features(squareFeet: Double, hasGarden: Boolean) val assembler = TypedVectorAssembler[Features] -// assembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@25721902 +// assembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@686bbdb9 case class HouseDataWithFeatures(squareFeet: Double, hasGarden: Boolean, price: Double, features: Vector) val trainingDataWithFeatures = assembler.transform(trainingData).as[HouseDataWithFeatures] @@ -212,10 +212,10 @@

case class RFInputs(price: Double, features: Vector) val rf = TypedRandomForestRegressor[RFInputs] -// rf: TypedRandomForestRegressor[RFInputs] = frameless.ml.regression.TypedRandomForestRegressor@49c125ff +// rf: TypedRandomForestRegressor[RFInputs] = frameless.ml.regression.TypedRandomForestRegressor@4197f8a4 val model = rf.fit(trainingDataWithFeatures).run() -// model: AppendTransformer[RFInputs, TypedRandomForestRegressor.Outputs, org.apache.spark.ml.regression.RandomForestRegressionModel] = frameless.ml.TypedEstimator$$anon$1@2a38506d +// model: AppendTransformer[RFInputs, TypedRandomForestRegressor.Outputs, org.apache.spark.ml.regression.RandomForestRegressionModel] = frameless.ml.TypedEstimator$$anon$1@737e204

TypedRandomForestRegressor[RFInputs] compiles only if RFInputs contains only one field of type Double (the label) and one field of type Vector (the features):

case class WrongRFInputs(labelOfWrongType: String, features: Vector)
@@ -281,7 +281,7 @@

case class Features(price: Double, squareFeet: Double) val vectorAssembler = TypedVectorAssembler[Features] -// vectorAssembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@534bd407 +// vectorAssembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@3987af2c case class HouseDataWithFeatures(squareFeet: Double, city: String, price: Double, features: Vector) val dataWithFeatures = vectorAssembler.transform(trainingData).as[HouseDataWithFeatures] @@ -289,11 +289,11 @@

case class StringIndexerInput(city: String) val indexer = TypedStringIndexer[StringIndexerInput] -// indexer: TypedStringIndexer[StringIndexerInput] = frameless.ml.feature.TypedStringIndexer@55723184 +// indexer: TypedStringIndexer[StringIndexerInput] = frameless.ml.feature.TypedStringIndexer@74e177e8 indexer.estimator.setHandleInvalid("keep") -// res12: org.apache.spark.ml.feature.StringIndexer = strIdx_7b9dc017fbaa +// res12: org.apache.spark.ml.feature.StringIndexer = strIdx_6a821904c6cc val indexerModel = indexer.fit(dataWithFeatures).run() -// indexerModel: AppendTransformer[StringIndexerInput, TypedStringIndexer.Outputs, org.apache.spark.ml.feature.StringIndexerModel] = frameless.ml.TypedEstimator$$anon$1@7e025b20 +// indexerModel: AppendTransformer[StringIndexerInput, TypedStringIndexer.Outputs, org.apache.spark.ml.feature.StringIndexerModel] = frameless.ml.TypedEstimator$$anon$1@4d305bee case class HouseDataWithFeaturesAndIndex( squareFeet: Double, @@ -307,10 +307,10 @@

case class RFInputs(cityIndexed: Double, features: Vector) val rf = TypedRandomForestClassifier[RFInputs] -// rf: TypedRandomForestClassifier[RFInputs] = frameless.ml.classification.TypedRandomForestClassifier@1919d203 +// rf: TypedRandomForestClassifier[RFInputs] = frameless.ml.classification.TypedRandomForestClassifier@6763ffd9 val model = rf.fit(indexedData).run() -// model: AppendTransformer[RFInputs, TypedRandomForestClassifier.Outputs, org.apache.spark.ml.classification.RandomForestClassificationModel] = frameless.ml.TypedEstimator$$anon$1@14ffd6cb +// model: AppendTransformer[RFInputs, TypedRandomForestClassifier.Outputs, org.apache.spark.ml.classification.RandomForestClassificationModel] = frameless.ml.TypedEstimator$$anon$1@6399c2d6

Prediction

We now want to predict city for testData using the previously trained model. Like the Spark ML API, @@ -342,7 +342,7 @@

case class IndexToStringInput(predictedCityIndexed: Double) val indexToString = TypedIndexToString[IndexToStringInput](indexerModel.transformer.labels) -// indexToString: TypedIndexToString[IndexToStringInput] = frameless.ml.feature.TypedIndexToString@50c53bc1 +// indexToString: TypedIndexToString[IndexToStringInput] = frameless.ml.feature.TypedIndexToString@1c38a77a case class HouseCityPrediction( features: Vector,