From the documentation, I tried a very simple classification pipeline: I distribute the training data, then initialize both a distributed pipeline and a local one, and fit each.
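Roughly, following the classification example from the splearn README (X here stands for a list of documents and y for the matching labels, sc is an already-running SparkContext, and the data loading itself is omitted):

```python
import numpy as np

from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from splearn.feature_extraction.text import SparkHashingVectorizer, SparkTfidfTransformer
from splearn.pipeline import SparkPipeline
from splearn.rdd import DictRDD
from splearn.svm import SparkLinearSVC

# Distribute the documents and labels and zip them into a single DictRDD
X_rdd = sc.parallelize(X, 4)
y_rdd = sc.parallelize(y, 4)
Z = DictRDD((X_rdd, y_rdd), columns=('X', 'y'), dtype=[np.ndarray, np.ndarray])

# A local scikit-learn pipeline and its distributed splearn counterpart
local_pipeline = Pipeline((
    ('vect', HashingVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
))
dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer()),
    ('tfidf', SparkTfidfTransformer()),
    ('clf', SparkLinearSVC()),
))

local_pipeline.fit(X, y)
dist_pipeline.fit(Z, clf__classes=np.unique(y))  # <- the call that fails

y_pred_local = local_pipeline.predict(X)
y_pred_dist = dist_pipeline.predict(Z[:, 'X'])
```

However, fitting the distributed pipeline raises the following exception: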
```
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-26-dfd551e98564> in <module>()
11
12 local_pipeline.fit(X, y)
---> 13 dist_pipeline.fit(Z, clf__classes=np.unique(y))
14
15 y_pred_local = local_pipeline.predict(X)
/usr/local/lib/python3.5/site-packages/splearn/pipeline.py in fit(self, Z, **fit_params)
108 """
109 Zt, fit_params = self._pre_transform(Z, **fit_params)
--> 110 self.steps[-1][-1].fit(Zt, **fit_params)
111 return self
112
/usr/local/lib/python3.5/site-packages/splearn/svm/classes.py in fit(self, Z, classes)
117 check_rdd(Z, {'X': (sp.spmatrix, np.ndarray)})
118 self._classes_ = np.unique(classes)
--> 119 return self._spark_fit(SparkLinearSVC, Z)
120
121 def predict(self, X):
/usr/local/lib/python3.5/site-packages/splearn/linear_model/base.py in _spark_fit(self, cls, Z, *args, **kwargs)
82 )
83 models = Z.map(mapper)
---> 84 avg = models.sum() / models.count()
85 self.__dict__.update(avg.__dict__)
86 return self
/usr/local/lib/python3.5/site-packages/splearn/rdd.py in bypass(*args, **kwargs)
172 """
173 def bypass(*args, **kwargs):
--> 174 result = getattr(self._rdd, attr)(*args, **kwargs)
175 if isinstance(result, RDD):
176 if result is self._rdd:
/usr/local/Cellar/apache-spark/1.6.1/libexec/python/pyspark/rdd.py in sum(self)
993 6.0
994 """
--> 995 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
996
997 def count(self):
/usr/local/Cellar/apache-spark/1.6.1/libexec/python/pyspark/rdd.py in fold(self, zeroValue, op)
867 # zeroValue provided to each partition is unique from the one provided
868 # to the final reduce call
--> 869 vals = self.mapPartitions(func).collect()
870 return reduce(op, vals, zeroValue)
871
/usr/local/Cellar/apache-spark/1.6.1/libexec/python/pyspark/rdd.py in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
/usr/local/Cellar/apache-spark/1.6.1/libexec/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/1.6.1/libexec/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/1.6.1/libexec/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 30.0 failed 1 times, most recent failure: Lost task 0.0 in stage 30.0 (TID 96, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/local/Cellar/apache-spark/1.6.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/usr/local/Cellar/apache-spark/1.6.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/local/Cellar/apache-spark/1.6.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/usr/local/Cellar/apache-spark/1.6.1/libexec/python/pyspark/rdd.py", line 864, in func
acc = op(obj, acc)
File "/usr/local/lib/python3.5/site-packages/splearn/linear_model/base.py", line 27, in __add__
model.coef_ += other.coef_
AttributeError: 'int' object has no attribute 'coef_'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.GeneratedMethodAccessor41.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/local/Cellar/apache-spark/1.6.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/usr/local/Cellar/apache-spark/1.6.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/local/Cellar/apache-spark/1.6.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/usr/local/Cellar/apache-spark/1.6.1/libexec/python/pyspark/rdd.py", line 864, in func
acc = op(obj, acc)
File "/usr/local/lib/python3.5/site-packages/splearn/linear_model/base.py", line 27, in __add__
model.coef_ += other.coef_
AttributeError: 'int' object has no attribute 'coef_'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
```
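For what it's worth, the innermost Python frames suggest the data is not the problem: RDD.sum() is implemented as mapPartitions(...).fold(0, operator.add), and fold computes acc = op(obj, acc) with acc starting at the integer zeroValue 0, so splearn's model __add__ in linear_model/base.py receives an int instead of another model. A standalone sketch of the mechanism (the Model class below is hypothetical, only mimicking the coef_ handling at base.py line 27):

```python
from functools import reduce

import numpy as np


class Model(object):
    """Hypothetical stand-in for a fitted splearn linear model."""

    def __init__(self, coef):
        self.coef_ = np.asarray(coef, dtype=float)

    def __add__(self, other):
        # Mimics splearn/linear_model/base.py line 27:
        # blindly assumes `other` is another fitted model.
        self.coef_ += other.coef_
        return self


models = [Model([1.0, 2.0]), Model([3.0, 4.0])]

# fold(zeroValue=0, op=operator.add) repeatedly computes op(obj, acc),
# and acc starts out as the integer 0, so the very first step is
# Model.__add__(model, 0), which raises
# AttributeError: 'int' object has no attribute 'coef_'
reduce(lambda acc, m: m + acc, models, 0)
```

In this toy version, treating an integer 0 as the identity in __add__ (returning self when other == 0) lets the reduction go through, so I suspect that is where a fix would belong, though I may be misreading splearn's internals.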
Any idea why this is happening and how to solve it?