xgi-org · nwlandry · Mar 13, 2024 · Mar 9, 2024 · Mar 9, 2024 · Mar 9, 2024
diff --git a/docs/source/api/recipes/recipes.ipynb b/docs/source/api/recipes/recipes.ipynb
@@ -15,6 +15,7 @@
    "outputs": [],
    "source": [
     "import networkx as nx\n",
+    "\n",
     "import xgi"
    ]
   },
@@ -52,10 +53,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import xgi\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
-    "from numpy.linalg import eigh"
+    "from numpy.linalg import eigh\n",
+    "\n",
+    "import xgi"
    ]
   },
   {
@@ -112,9 +114,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import xgi\n",
     "from itertools import permutations\n",
     "\n",
+    "import xgi\n",
+    "\n",
     "\n",
     "def adjacency_tensor(H, order):\n",
     "    N = H.num_nodes\n",
@@ -411,10 +414,11 @@
     }
    ],
    "source": [
-    "import xgi\n",
     "import pandas as pd\n",
     "import seaborn as sns\n",
     "\n",
+    "import xgi\n",
+    "\n",
     "H = xgi.load_xgi_data(\"diseasome\")\n",
     "H.merge_duplicate_edges(rename=\"tuple\", multiplicity=\"weight\")\n",
     "\n",
@@ -456,11 +460,11 @@
     }
    ],
    "source": [
-    "import xgi\n",
     "import matplotlib.pyplot as plt\n",
     "import pandas as pd\n",
     "import seaborn as sns\n",
     "\n",
+    "import xgi\n",
     "\n",
     "H = xgi.load_xgi_data(\"diseasome\")\n",
     "df1 = H.nodes.degree.ashist(bin_edges=True)\n",
@@ -526,8 +530,8 @@
      "data": {
       "text/plain": [
        "(<Axes3DSubplot: >,\n",
-       " (<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x28c34b4d0>,\n",
-       "  <mpl_toolkits.mplot3d.art3d.Poly3DCollection at 0x28d886110>))"
+       " (<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x28cae3ad0>,\n",
+       "  <mpl_toolkits.mplot3d.art3d.Poly3DCollection at 0x28caaaa50>))"
       ]
      },
      "execution_count": 20,
@@ -546,9 +550,10 @@
     }
    ],
    "source": [
-    "import xgi\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
+    "import xgi\n",
+    "\n",
     "H = xgi.random_hypergraph(N=10, ps=[0.2, 0.05, 0.05], seed=1)\n",
     "\n",
     "_, ax = plt.subplots(figsize=(4, 4), subplot_kw={\"projection\": \"3d\"})\n",
@@ -711,6 +716,7 @@
    ],
    "source": [
     "import networkx as nx\n",
+    "\n",
     "import xgi\n",
     "\n",
     "m, n = 10, 20\n",
@@ -755,9 +761,10 @@
     }
    ],
    "source": [
-    "import xgi\n",
     "import numpy as np\n",
     "\n",
+    "import xgi\n",
+    "\n",
     "H = xgi.random_hypergraph(N=100, ps=[0.2, 0.02], seed=1)\n",
     "N = H.num_nodes\n",
     "spl = xgi.shortest_path_length(H)\n",
@@ -835,6 +842,91 @@
     "matching_ids = [k for k, v in H.degree().items() if v == d]\n",
     "print(f\"Nodes {', '.join(matching_ids)} have degree {d}\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 18. Define a custom filtering function\n",
+    "\n",
+    "In addition to the pre-defined filtering functionality, one can also define a custom comparison operator to compare statistics and attributes.\n",
+    "\n",
+    "First, we show an example for numerical statistics and second, an example for attributes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Numerical statistics**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The total number of nodes is 148\n",
+      "The number of nodes with degree less than 3 or greater than 20 is 128\n"
+     ]
+    }
+   ],
+   "source": [
+    "import xgi\n",
+    "\n",
+    "outsiderange = lambda val, arg: arg[0] > val or val > arg[1]\n",
+    "\n",
+    "H = xgi.load_xgi_data(\"email-enron\")\n",
+    "print(f\"The total number of nodes is {H.num_nodes}\")\n",
+    "\n",
+    "# Get all of the nodes that have degree less than 3 or greater than 20\n",
+    "nodes = H.nodes.filterby(\"degree\", [3, 20], mode=outsiderange)\n",
+    "print(f\"The number of nodes with degree less than 3 or greater than 20 is {len(nodes)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Attributes**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The total number of hyperedges is 10885\n",
+      "The number of hyperedges between 01JAN2000 and 01JAN2001 is 3992\n"
+     ]
+    }
+   ],
+   "source": [
+    "import datetime\n",
+    "\n",
+    "import xgi\n",
+    "\n",
+    "date1 = datetime.datetime(2000, 1, 1)\n",
+    "date2 = datetime.datetime(2001, 1, 1)\n",
+    "datecompare = (\n",
+    "    lambda date, arg: arg[0] <= datetime.datetime.fromisoformat(date) <= arg[1]\n",
+    ")\n",
+    "\n",
+    "H = xgi.load_xgi_data(\"email-enron\")\n",
+    "print(f\"The total number of hyperedges is {H.num_edges}\")\n",
+    "\n",
+    "# Get all of the hyperedges with timestamps between 01JAN2000 and 01JAN2001\n",
+    "e = H.edges.filterby_attr(\"timestamp\", [date1, date2], mode=datecompare)\n",
+    "print(f\"The number of hyperedges between 01JAN2000 and 01JAN2001 is {len(e)}\")"
+   ]
   }
  ],
  "metadata": {

diff --git a/tests/core/test_diviews.py b/tests/core/test_diviews.py
@@ -221,3 +221,15 @@ def test_isolates():
     assert set(DH.nodes.isolates()) == {0, 1, 2, 3}
     DH.add_edge([{0}, {1, 2}])
     assert set(DH.nodes.isolates()) == {3}
+
+
+def test_diview_custom_filterby(diedgelist2):
+    H = xgi.DiHypergraph(diedgelist2)
+
+    f = lambda val, arg: val % arg == 0
+    assert set(H.edges.filterby("tail_size", 2, mode=f)) == {0, 1}
+
+
+def test_diview_custom_filterby_attr(dihyperwithattrs):
+    f = lambda val, arg: arg in val
+    assert set(dihyperwithattrs.nodes.filterby_attr("color", "l", mode=f)) == {2, 3, 5}
diff --git a/tests/core/test_views.py b/tests/core/test_views.py
@@ -346,3 +346,15 @@ def test_ids_are_immutable(edgelist5):
     H = xgi.Hypergraph(edgelist5)
     H.edges.ids.add(42)
     assert H.edges.ids == {0, 1, 2, 3}
+
+
+def test_view_custom_filterby(edgelist8):
+    H = xgi.Hypergraph(edgelist8)
+
+    f = lambda val, arg: val <= arg**2
+    assert set(H.nodes.filterby("degree", 2, mode=f)) == {2, 3, 4, 5, 6}
+
+
+def test_view_custom_filterby_attr(hyperwithattrs):
+    f = lambda val, arg: arg in val
+    assert set(hyperwithattrs.nodes.filterby_attr("color", "l", mode=f)) == {2, 3, 5}
diff --git a/xgi/core/diviews.py b/xgi/core/diviews.py
@@ -190,7 +190,7 @@ def filterby(self, stat, val, mode="eq"):
         val : Any
             Value of the statistic.  Usually a single numeric value.  When mode is
             'between', must be a tuple of exactly two values.
-        mode : str, optional
+        mode : str or function, optional
             How to compare each value to `val`.  Can be one of the following.
 
             * 'eq' (default): Return IDs whose value is exactly equal to `val`.
@@ -201,6 +201,7 @@ def filterby(self, stat, val, mode="eq"):
             * 'geq': Return IDs whose value is greater than or equal to `val`.
             * 'between': In this mode, `val` must be a tuple `(val1, val2)`.  Return IDs
               whose value `v` satisfies `val1 <= v <= val2`.
+            * function: Return IDs whose value `v` makes `mode(v, val)` evaluate as true.
 
         See Also
         --------
@@ -256,6 +257,8 @@ def filterby(self, stat, val, mode="eq"):
             bunch = [idx for idx in self if values[idx] >= val]
         elif mode == "between":
             bunch = [node for node in self if val[0] <= values[node] <= val[1]]
+        elif callable(mode):
+            bunch = [idx for idx in self if mode(values[idx], val)]
         else:
             raise ValueError(
                 f"Unrecognized mode {mode}. mode must be one of 'eq', 'neq', 'lt', 'gt', 'leq', 'geq', or 'between'."
@@ -271,9 +274,10 @@ def filterby_attr(self, attr, val, mode="eq", missing=None):
             The name of the attribute
         val : Any
             A single value or, in the case of 'between', a list of length 2
-        mode : str, optional
+        mode : str or function, optional
             Comparison mode. Valid options are 'eq' (default), 'neq', 'lt', 'gt',
-            'leq', 'geq', or 'between'.
+            'leq', 'geq', or 'between'. If a function, it is called as
+            `mode(attribute, val)` and must return a bool.
         missing : Any, optional
             The default value if the attribute is missing. If None (default),
             ignores those IDs.
@@ -323,9 +327,15 @@ def filterby_attr(self, attr, val, mode="eq", missing=None):
                 for idx in self
                 if values[idx] is not None and val[0] <= values[idx] <= val[1]
             ]
+        elif callable(mode):
+            bunch = [
+                idx
+                for idx in self
+                if values[idx] is not None and mode(values[idx], val)
+            ]
         else:
             raise ValueError(
-                f"Unrecognized mode {mode}. mode must be one of 'eq', 'neq', 'lt', 'gt', 'leq', 'geq', or 'between'."
+                f"Unrecognized mode {mode}. mode must be one of 'eq', 'neq', 'lt', 'gt', 'leq', 'geq', 'between', or a callable function."
             )
         return type(self).from_view(self, bunch)
 

diff --git a/xgi/core/views.py b/xgi/core/views.py
@@ -183,7 +183,7 @@ def filterby(self, stat, val, mode="eq"):
         val : Any
             Value of the statistic.  Usually a single numeric value.  When mode is
             'between', must be a tuple of exactly two values.
-        mode : str, optional
+        mode : str or function, optional
             How to compare each value to `val`.  Can be one of the following.
 
             * 'eq' (default): Return IDs whose value is exactly equal to `val`.
@@ -194,6 +194,7 @@ def filterby(self, stat, val, mode="eq"):
             * 'geq': Return IDs whose value is greater than or equal to `val`.
             * 'between': In this mode, `val` must be a tuple `(val1, val2)`.  Return IDs
               whose value `v` satisfies `val1 <= v <= val2`.
+            * function: Return IDs whose value `v` makes `mode(v, val)` evaluate as true.
 
         See Also
         --------
@@ -254,7 +255,9 @@ def filterby(self, stat, val, mode="eq"):
         elif mode == "geq":
             bunch = [idx for idx in self if values[idx] >= val]
         elif mode == "between":
-            bunch = [idx for idx in self if val[0] <= values[idx] <= val[1]]
+            bunch = [node for node in self if val[0] <= values[node] <= val[1]]
+        elif callable(mode):
+            bunch = [idx for idx in self if mode(values[idx], val)]
         else:
             raise ValueError(
                 f"Unrecognized mode {mode}. mode must be one of "
@@ -271,9 +274,10 @@ def filterby_attr(self, attr, val, mode="eq", missing=None):
             The name of the attribute
         val : Any
             A single value or, in the case of 'between', a list of length 2
-        mode : str, optional
+        mode : str or function, optional
             Comparison mode. Valid options are 'eq' (default), 'neq', 'lt', 'gt',
-            'leq', 'geq', or 'between'.
+            'leq', 'geq', or 'between'. If a function, it is called as
+            `mode(attribute, val)` and must return a bool.
         missing : Any, optional
             The default value if the attribute is missing. If None (default),
             ignores those IDs.
@@ -323,6 +327,12 @@ def filterby_attr(self, attr, val, mode="eq", missing=None):
                 for idx in self
                 if values[idx] is not None and val[0] <= values[idx] <= val[1]
             ]
+        elif callable(mode):
+            bunch = [
+                idx
+                for idx in self
+                if values[idx] is not None and mode(values[idx], val)
+            ]
         else:
             raise ValueError(
                 f"Unrecognized mode {mode}. mode must be one of "