diff --git a/notebooks/goea_nbt3102.ipynb b/notebooks/goea_nbt3102.ipynb index a8f4f31..6f7c956 100755 --- a/notebooks/goea_nbt3102.ipynb +++ b/notebooks/goea_nbt3102.ipynb @@ -11,7 +11,7 @@ "subpopulations of cells\n", "](http://www.nature.com/nbt/journal/v33/n2/full/nbt.3102.html#methods)\n", "\n", - "Note: you must have the Python package, **xlrd**, installed to run this example. \n", + "Note: you must have the Python packages, **xlrd**, **pandas** and **openpyxl**, installed to run this example. \n", "\n", "Note: To create plots, you must have:\n", " * Python packages: **pyparsing**, **pydot**\n", @@ -160,9 +160,7 @@ "metadata": {}, "source": [ "### 2c. Load Background gene set\n", - "In this example, the background is all mouse protein-codinge genes. \n", - "\n", - "Follow the instructions in the `background_genes_ncbi` notebook to download a set of background population genes from NCBI." + "In this example, the background is all mouse protein-coding genes." ] }, { @@ -179,7 +177,7 @@ } ], "source": [ - "from genes_ncbi_10090_proteincoding import GENEID2NT as GeneID2nt_mus\n", + "from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus\n", "print(len(GeneID2nt_mus))" ] }, @@ -251,21 +249,22 @@ } ], "source": [ - "# Data will be stored in this variable\n", "import os\n", - "geneid2symbol = {}\n", + "import goatools\n", "# Get xlsx filename where data is stored\n", - "ROOT = os.path.dirname(os.getcwd()) # go up 1 level from current working directory\n", - "din_xlsx = os.path.join(ROOT, \"goatools/test_data/nbt_3102/nbt.3102-S4_GeneIDs.xlsx\")\n", + "ROOT = os.path.dirname(goatools.__file__) # goatools root directory\n", + "din_xlsx = os.path.join(ROOT, \"test_data\", \"nbt_3102\", \"nbt.3102-S4_GeneIDs.xlsx\")\n", "# Read data\n", "if os.path.isfile(din_xlsx): \n", - " import xlrd\n", - " book = xlrd.open_workbook(din_xlsx)\n", - " pg = book.sheet_by_index(0)\n", - " for r in range(pg.nrows):\n", - " symbol, geneid, pval = 
[pg.cell_value(r, c) for c in range(pg.ncols)]\n", - " if geneid:\n", - " geneid2symbol[int(geneid)] = symbol\n", + " import pandas as pd\n", + " df = pd.read_excel(\n", + " din_xlsx,\n", + " header=None, \n", + " names=[\"symbol\", \"geneid\", \"pval\"], \n", + " dtype={\"symbol\": str, \"geneid\": int, \"pval\": float},\n", + " index_col=1,\n", + " ) # requires openpyxl\n", + " geneid2symbol = df[\"symbol\"].to_dict()\n", " print('{N} genes READ: {XLSX}'.format(N=len(geneid2symbol), XLSX=din_xlsx))\n", "else:\n", " raise RuntimeError('FILE NOT FOUND: {XLSX}'.format(XLSX=din_xlsx))"