Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NUTCH-3032 Code for an ArbitraryIndexingFilter to index values resolved by user POJO code at index time #810

Merged
merged 8 commits into from
Apr 4, 2024
4 changes: 4 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@
<packageset dir="${plugins.dir}/headings/src/java"/>
<packageset dir="${plugins.dir}/exchange-jexl/src/java"/>
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-arbitrary/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
Expand Down Expand Up @@ -646,6 +647,7 @@
<packageset dir="${plugins.dir}/headings/src/java"/>
<packageset dir="${plugins.dir}/exchange-jexl/src/java"/>
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-arbitrary/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
Expand Down Expand Up @@ -1173,6 +1175,8 @@
<source path="${plugins.dir}/exchange-jexl/src/java/" />
<source path="${plugins.dir}/index-anchor/src/java/" />
<source path="${plugins.dir}/index-anchor/src/test/" />
<source path="${plugins.dir}/index-arbitrary/src/java/" />
<source path="${plugins.dir}/index-arbitrary/src/test/" />
<source path="${plugins.dir}/index-basic/src/java/" />
<source path="${plugins.dir}/index-basic/src/test/" />
<source path="${plugins.dir}/index-geoip/src/java/" />
Expand Down
66 changes: 66 additions & 0 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2252,6 +2252,72 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>

<!-- index-arbitrary plugin properties -->
<property>
  <name>index.arbitrary.function.count</name>
  <value></value>
  <description>The count of arbitrary additions/edits to the document.
  Specify the remaining properties (fieldName, className, constructorArgs,
  methodName, methodArgs, and overwrite) independently in this file by
  appending a dot (.) followed by integer numerals (beginning with '0') to
  the property names, e.g.:

    index.arbitrary.fieldName.0
  for the field to add/set with the first arbitrary addition or:

    index.arbitrary.className.3
  for the POJO class name to use in setting the fourth arbitrary addition.
  </description>
</property>

<property>
  <name>index.arbitrary.fieldName.0</name>
  <value></value>
  <description>The name of the field to add to the document with the value
  returned from the custom POJO.</description>
</property>

<property>
  <name>index.arbitrary.className.0</name>
  <value></value>
  <description>The fully qualified name of the POJO class that will supply
  values for the new field.</description>
</property>

<property>
  <name>index.arbitrary.constructorArgs.0</name>
  <value></value>
  <description>The values (as strings) to pass into the POJO constructor.
  The POJO must accept a String representation of the NutchDocument's URL
  as the first parameter in the constructor. The values you specify here
  will populate the constructor arguments 1,..,n-1 where n=the count of
  arguments to the constructor. Argument #0 will be the NutchDocument's URL.
  </description>
</property>

<property>
  <name>index.arbitrary.methodName.0</name>
  <value></value>
  <description>The name of the method to invoke on the instance of your custom
  class in order to determine the value to add to the document.</description>
</property>

<property>
  <name>index.arbitrary.methodArgs.0</name>
  <value></value>
  <description>The values (as strings) to pass into the named method on the POJO
  instance. Unlike the constructor args, there is no required argument that this
  method in the POJO must accept, i.e., the Arbitrary Indexer doesn't supply any
  arguments taken from the NutchDocument values by default.</description>
</property>

<property>
  <name>index.arbitrary.overwrite.0</name>
  <value></value>
  <description>Whether to overwrite any existing value in the doc for
  fieldName. Default is false if not specified in config.</description>
</property>

<!-- parse-metatags plugin properties -->
<property>
<name>metatags.names</name>
Expand Down
3 changes: 3 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
<ant dir="headings" target="deploy"/>
<ant dir="exchange-jexl" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-arbitrary" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-geoip" target="deploy"/>
<ant dir="index-jexl-filter" target="deploy"/>
Expand Down Expand Up @@ -117,6 +118,7 @@
<ant dir="feed" target="test"/>
<ant dir="headings" target="test"/>
<ant dir="index-anchor" target="test"/>
<ant dir="index-arbitrary" target="test"/>
<ant dir="index-basic" target="test"/>
<!--ant dir="index-geoip" target="test"/-->
<ant dir="index-jexl-filter" target="test"/>
Expand Down Expand Up @@ -179,6 +181,7 @@
<ant dir="headings" target="clean"/>
<ant dir="exchange-jexl" target="clean"/>
<ant dir="index-anchor" target="clean"/>
<ant dir="index-arbitrary" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-geoip" target="clean"/>
<ant dir="index-jexl-filter" target="clean"/>
Expand Down
22 changes: 22 additions & 0 deletions src/plugin/index-arbitrary/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0"?>
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="index-arbitrary" default="jar-core">

  <!-- Build logic is pulled in from the build-plugin.xml file one directory up. -->
  <import file="../build-plugin.xml"/>

</project>
39 changes: 39 additions & 0 deletions src/plugin/index-arbitrary/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<ivy-module version="1.0">
  <info organisation="org.apache.nutch" module="${ant.project.name}">
    <license name="Apache 2.0"/>
    <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/>
    <description>
Apache Nutch
</description>
  </info>

  <!-- Configurations are taken from the included ivy-configurations.xml. -->
  <configurations>
    <include file="../../../ivy/ivy-configurations.xml"/>
  </configurations>

  <publications>
    <!-- get the artifact from our module name -->
    <artifact conf="master"/>
  </publications>

  <!-- No dependencies are declared for this module. -->
  <dependencies/>

</ivy-module>
42 changes: 42 additions & 0 deletions src/plugin/index-arbitrary/plugin.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<plugin id="index-arbitrary" name="Index Arbitrary" version="1.0.0" provider-name="nutch.org">

  <!-- Runtime library of this plugin, declared with the export pattern "*". -->
  <runtime>
    <library name="index-arbitrary.jar">
      <export name="*"/>
    </library>
  </runtime>

  <requires>
    <import plugin="nutch-extensionpoints"/>
  </requires>

  <!-- Hooks ArbitraryIndexingFilter into the IndexingFilter extension point. -->
  <extension id="org.apache.nutch.indexer.arbitrary"
             name="Nutch arbitrary data indexer"
             point="org.apache.nutch.indexer.IndexingFilter">
    <implementation id="ArbitraryIndexingFilter"
                    class="org.apache.nutch.indexer.arbitrary.ArbitraryIndexingFilter"/>
  </extension>

</plugin>
Loading
Loading