
Diff Issue #227

Closed
VinothKanna007 opened this issue Jan 26, 2024 · 4 comments

Comments
@VinothKanna007

VinothKanna007 commented Jan 26, 2024

Error:

AttributeError: 'DiffOptions' object has no attribute '_get_object_id'

Steps to recreate an issue:

Run the below code:

# required libraries
import datetime
import os
import sys

import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

my_conf = SparkConf()
my_conf.set("spark.app.name", "w12_Spark_1")
my_conf.set("spark.master", "local[*]")
# my_conf.set("spark.driver.extraClassPath", "C:/Users/Vinoth/Downloads/sqljdbc_12.4.2.0_enu/sqljdbc_12.4/enu/jars/mssql-jdbc-12.4.2.jre8.jar")
my_conf.set('spark.jars.packages', 'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5')

spark = SparkSession.builder.config(conf = my_conf).getOrCreate()
spark

from gresearch.spark.diff import *

left = spark.createDataFrame([(1, 1.0), (2, 2.0), (3, 3.0)]).toDF("id", "value")
right = spark.createDataFrame([(1, 1.0), (2, 2.02), (3, 3.05)]).toDF("id", "value")
left.diff(right, "id").show()

options = DiffOptions() \
  .with_diff_column("d") \
  .with_left_column_prefix("l") \
  .with_right_column_prefix("r") \
  .with_insert_diff_value("i") \
  .with_change_diff_value("c") \
  .with_delete_diff_value("d") \
  .with_nochange_diff_value("n") \
  .with_change_column("changes") \
  .with_diff_mode(DiffMode.Default) \
  .with_default_comparator(DiffComparators.epsilon(0.01)) \
  .with_data_type_comparator(DiffComparators.epsilon(0.001), DoubleType()) \
  .with_column_name_comparator(DiffComparators.epsilon(0.001), "float_column")



left.diff(right, options, "id").show()
@EnricoMi
Contributor

EnricoMi commented Jan 26, 2024

Which PySpark version are you using? Can I get the full stacktrace / traceback?

@VinothKanna007
Author

FYI

(screenshot of the error attached)

Full StackTrace:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[7], line 1
----> 1 left.diff(right, options, "id").show()

File ~\AppData\Local\Temp\spark-e2e8987d-bcf6-435b-8d3f-c0ff403989df\userFiles-18025021-b073-445a-aa43-ed482f69353f\uk.co.gresearch.spark_spark-extension_2.12-2.11.0-3.5.jar\gresearch\spark\diff\__init__.py:427, in diff(self, other, *id_columns)
    367 def diff(self: DataFrame, other: DataFrame, *id_columns: str) -> DataFrame:
    368     """
    369     Returns a new DataFrame that contains the differences between this and the other DataFrame.
    370     Both DataFrames must contain the same set of column names and data types.
   (...)
    425     :rtype DataFrame
    426     """
--> 427     return Differ().diff(self, other, *id_columns)

File ~\AppData\Local\Temp\spark-e2e8987d-bcf6-435b-8d3f-c0ff403989df\userFiles-18025021-b073-445a-aa43-ed482f69353f\uk.co.gresearch.spark_spark-extension_2.12-2.11.0-3.5.jar\gresearch\spark\diff\__init__.py:338, in Differ.diff(self, left, right, *id_columns)
    336 jvm = left._sc._jvm
    337 jdiffer = self._to_java(jvm)
--> 338 jdf = jdiffer.diff(left._jdf, right._jdf, _to_seq(jvm, list(id_columns)))
    339 return DataFrame(jdf, left.session_or_ctx())

File ~\AppData\Local\Temp\spark-e2e8987d-bcf6-435b-8d3f-c0ff403989df\userFiles-18025021-b073-445a-aa43-ed482f69353f\uk.co.gresearch.spark_spark-extension_2.12-2.11.0-3.5.jar\gresearch\spark\__init__.py:41, in _to_seq(jvm, list)
     40 def _to_seq(jvm: JVMView, list: List[Any]) -> JavaObject:
---> 41     array = jvm.java.util.ArrayList(list)
     42     return jvm.scala.collection.JavaConverters.asScalaIteratorConverter(array.iterator()).asScala().toSeq()

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py:1573, in JavaClass.__call__(self, *args)
   1570 def __call__(self, *args):
   1571     # TODO Refactor to use a mixin shared by JavaMember and JavaClass
   1572     if self._converters is not None and len(self._converters) > 0:
-> 1573         (new_args, temp_args) = self._get_args(args)
   1574     else:
   1575         new_args = args

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py:1559, in JavaClass._get_args(self, args)
   1557 for converter in self._converters:
   1558     if converter.can_convert(arg):
-> 1559         temp_arg = converter.convert(arg, self._gateway_client)
   1560         temp_args.append(temp_arg)
   1561         new_args.append(temp_arg)

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_collections.py:511, in ListConverter.convert(self, object, gateway_client)
    509 java_list = ArrayList()
    510 for element in object:
--> 511     java_list.add(element)
    512 return java_list

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py:1314, in JavaMember.__call__(self, *args)
   1313 def __call__(self, *args):
-> 1314     args_command, temp_args = self._build_args(*args)
   1316     command = proto.CALL_COMMAND_NAME +\
   1317         self.command_header +\
   1318         args_command +\
   1319         proto.END_COMMAND_PART
   1321     answer = self.gateway_client.send_command(command)

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py:1283, in JavaMember._build_args(self, *args)
   1279     new_args = args
   1280     temp_args = []
   1282 args_command = "".join(
-> 1283     [get_command_part(arg, self.pool) for arg in new_args])
   1285 return args_command, temp_args

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py:1283, in <listcomp>(.0)
   1279     new_args = args
   1280     temp_args = []
   1282 args_command = "".join(
-> 1283     [get_command_part(arg, self.pool) for arg in new_args])
   1285 return args_command, temp_args

File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\protocol.py:298, in get_command_part(parameter, python_proxy_pool)
    296         command_part += ";" + interface
    297 else:
--> 298     command_part = REFERENCE_TYPE + parameter._get_object_id()
    300 command_part += "\n"
    302 return command_part

AttributeError: 'DiffOptions' object has no attribute '_get_object_id'

@EnricoMi
Contributor

Now I get it: left.diff(right, ...) only takes column names.

If you want to provide DiffOptions, you need to use left.diff_with_options(right, options, ...):

left.diff_with_options(right, options, "id").show()

I think the method should check its input types and provide a useful error message.
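
A minimal sketch of what such a check could look like, assuming the diff signature and the Differ class shown in the traceback above (not the library's actual implementation):

from pyspark.sql import DataFrame
from gresearch.spark.diff import Differ

# Hypothetical sketch only: reject non-string id_columns early with a clear
# message instead of failing deep inside py4j with the AttributeError above.
def diff(self: DataFrame, other: DataFrame, *id_columns: str) -> DataFrame:
    for column in id_columns:
        if not isinstance(column, str):
            raise TypeError(
                f"diff() only accepts column names (str), got {type(column).__name__}; "
                "pass DiffOptions via diff_with_options() instead"
            )
    return Differ().diff(self, other, *id_columns)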

@VinothKanna007
Author

Thanks, it's working. Appreciate it!
