diff --git a/examples/campaign-finance.ipynb b/examples/campaign-finance.ipynb index d111c96..082ad5a 100644 --- a/examples/campaign-finance.ipynb +++ b/examples/campaign-finance.ipynb @@ -242,7 +242,7 @@ " \"E\": \"recount\",\n", " }\n", " first_letter = pgi[0]\n", - " return first_letter.substitute(election_types, else_=ibis.NA)\n", + " return first_letter.substitute(election_types, else_=ibis.null())\n", "\n", "\n", "cleaned = cleaned.mutate(election_type=get_election_type(_.TRANSACTION_PGI)).drop(\n", diff --git a/examples/imdb.ipynb b/examples/imdb.ipynb index 5428449..8af82ce 100644 --- a/examples/imdb.ipynb +++ b/examples/imdb.ipynb @@ -93,7 +93,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To ensure column names are Pythonic, we can relabel as `snake_case`." + "To ensure column names are Pythonic, we can rename as `snake_case`." ] }, { @@ -102,7 +102,7 @@ "metadata": {}, "outputs": [], "source": [ - "name_basics.relabel(\"snake_case\")" + "name_basics.rename(\"snake_case\")" ] }, { @@ -110,7 +110,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's grab all of the relevant IMDB tables and relabel columns." + "Let's grab all of the relevant IMDB tables and rename columns." ] }, { @@ -119,13 +119,13 @@ "metadata": {}, "outputs": [], "source": [ - "name_basics = ex.imdb_name_basics.fetch().relabel(\"snake_case\")\n", - "title_akas = ex.imdb_title_akas.fetch().relabel(\"snake_case\")\n", - "title_basics = ex.imdb_title_basics.fetch().relabel(\"snake_case\")\n", - "title_crew = ex.imdb_title_crew.fetch().relabel(\"snake_case\")\n", - "title_episode = ex.imdb_title_episode.fetch().relabel(\"snake_case\")\n", - "title_principals = ex.imdb_title_principals.fetch().relabel(\"snake_case\")\n", - "title_ratings = ex.imdb_title_ratings.fetch().relabel(\"snake_case\")" + "name_basics = ex.imdb_name_basics.fetch().rename(\"snake_case\")\n", + "title_akas = ex.imdb_title_akas.fetch().rename(\"snake_case\")\n", + "title_basics = ex.imdb_title_basics.fetch().rename(\"snake_case\")\n", + "title_crew = ex.imdb_title_crew.fetch().rename(\"snake_case\")\n", + "title_episode = ex.imdb_title_episode.fetch().rename(\"snake_case\")\n", + "title_principals = ex.imdb_title_principals.fetch().rename(\"snake_case\")\n", + "title_ratings = ex.imdb_title_ratings.fetch().rename(\"snake_case\")" ] }, { @@ -420,7 +420,7 @@ "metadata": {}, "outputs": [], "source": [ - "ibis.show_sql(name_basics)" + "ibis.to_sql(name_basics)" ] }, { @@ -437,8 +437,8 @@ "metadata": {}, "outputs": [], "source": [ - "title_akas = title_akas.mutate(title_id=tconst_to_int(_.title_id)).relabel(\n", - " {\"title_id\": \"tconst\"}\n", + "title_akas = title_akas.mutate(title_id=tconst_to_int(_.title_id)).rename(\n", + " {\"tconst\": \"title_id\"}\n", ")\n", "title_basics = title_basics.mutate(tconst=tconst_to_int(_.tconst))\n", "title_crew = title_crew.mutate(\n", diff --git a/requirements.txt b/requirements.txt index 4f3a0b3..1a0daba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,5 @@ jupyterlab == 3.4.8 ipywidgets altair pandas < 2.1 -ibis-framework[sqlite,duckdb,clickhouse] +ibis-framework[sqlite,duckdb,clickhouse,examples] ibis-substrait < 3.1 diff --git a/scripts/prepare_campaign_finance_data.py b/scripts/prepare_campaign_finance_data.py index 8130a91..f966fc6 100644 --- a/scripts/prepare_campaign_finance_data.py +++ b/scripts/prepare_campaign_finance_data.py @@ -34,33 +34,34 @@ if not parquet_path.exists(): print("Generating itcont.parquet...") # Read in the CSV - t = ibis.read_csv(csv_path) - # The CSV doesn't have a header, we need to manually add titles - header = [ - "CMTE_ID", - "AMNDT_IND", - "RPT_TP", - "TRANSACTION_PGI", - "IMAGE_NUM", - "TRANSACTION_TP", - "ENTITY_TP", - "NAME", - "CITY", - "STATE", - "ZIP_CODE", - "EMPLOYER", - "OCCUPATION", - "TRANSACTION_DT", - "TRANSACTION_AMT", - "OTHER_ID", - "TRAN_ID", - "FILE_NUM", - "MEMO_CD", - "MEMO_TEXT", - "SUB_ID", - ] - t = t.relabel(dict(zip(t.columns, header))) + t = ibis.read_csv( + csv_path, + header=False, + names=[ + "CMTE_ID", + "AMNDT_IND", + "RPT_TP", + "TRANSACTION_PGI", + "IMAGE_NUM", + "TRANSACTION_TP", + "ENTITY_TP", + "NAME", + "CITY", + "STATE", + "ZIP_CODE", + "EMPLOYER", + "OCCUPATION", + "TRANSACTION_DT", + "TRANSACTION_AMT", + "OTHER_ID", + "TRAN_ID", + "FILE_NUM", + "MEMO_CD", + "MEMO_TEXT", + "SUB_ID", + ], + ) # For the analysis, we're only going to use a few of the columns. To save # bandwidth, lets select out only the columns we'll be using.