This is a relatively complex process, but here's a worked example you can copy.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Worked example: label each k-means cluster with the majority value of a
  nominal attribute, join that label back onto the data, and write the
  result to a database.

  Note: expression values inside value="..." must have their double quotes
  written as &quot; (and > as &gt;), otherwise the document is not
  well-formed XML and cannot be parsed.
-->
<process version="7.0.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.0.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the Iris sample data set from the repository. -->
      <operator activated="true" class="retrieve" compatibility="7.0.000" expanded="true" height="68" name="Retrieve Iris" width="90" x="112" y="34">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <!-- Cluster the examples with k-means, k = 10. -->
      <operator activated="true" class="k_means" compatibility="7.0.000" expanded="true" height="82" name="Clustering" width="90" x="246" y="34">
        <parameter key="k" value="10"/>
      </operator>
      <!--
        category:        randomly "state" or "notstate" (rand() > 0.5);
                         presumably a stand-in for a real attribute, replace as needed.
        categoryNumeric: 1 when category is "state", otherwise 0, so it can be averaged.
      -->
      <operator activated="true" class="generate_attributes" compatibility="7.0.000" expanded="true" height="82" name="Generate Attributes" width="90" x="246" y="136">
        <list key="function_descriptions">
          <parameter key="category" value="if(rand()&gt;0.5, &quot;state&quot;, &quot;notstate&quot;)"/>
          <parameter key="categoryNumeric" value="if(category==&quot;state&quot;, 1, 0)"/>
        </list>
      </operator>
      <!-- Average categoryNumeric per cluster; a value above 0.5 means "state" is the majority. -->
      <operator activated="true" class="aggregate" compatibility="7.0.000" expanded="true" height="82" name="Aggregate" width="90" x="246" y="238">
        <list key="aggregation_attributes">
          <parameter key="categoryNumeric" value="average"/>
        </list>
        <parameter key="group_by_attributes" value="cluster"/>
      </operator>
      <!-- Turn the per-cluster average into a textual description ("state" or "private"). -->
      <operator activated="true" class="generate_attributes" compatibility="7.0.000" expanded="true" height="82" name="Generate Attributes (4)" width="90" x="380" y="340">
        <list key="function_descriptions">
          <parameter key="description" value="if ([average(categoryNumeric)]&gt;0.5, &quot;state&quot;,&quot;private&quot;)"/>
        </list>
      </operator>
      <!-- Left join: the Aggregate operator's "original" output (left) with the per-cluster description (right), keyed on cluster. -->
      <operator activated="true" class="join" compatibility="7.0.000" expanded="true" height="82" name="Join" width="90" x="514" y="238">
        <parameter key="join_type" value="left"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="cluster" value="cluster"/>
        </list>
      </operator>
      <!-- Write the joined result to a database table; connection, schema and table names are environment-specific. -->
      <operator activated="true" class="jdbc_connectors:write_database" compatibility="7.0.000" expanded="true" height="68" name="Write Database" width="90" x="715" y="238">
        <parameter key="connection" value="LocalMYSQL"/>
        <parameter key="schema_name" value="ascom"/>
        <parameter key="table_name" value="joinresult"/>
      </operator>
      <!-- Wiring between operators. -->
      <connect from_op="Retrieve Iris" from_port="output" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 1"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Generate Attributes (4)" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="original" to_op="Join" to_port="left"/>
      <connect from_op="Generate Attributes (4)" from_port="example set output" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_op="Write Database" to_port="input"/>
      <connect from_op="Write Database" from_port="through" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
The main points are:
- Create an attribute called `categoryNumeric`, derived from `category`, which is set to 1 if `category` is `state` and 0 otherwise.
- Aggregate by cluster and take the average of `categoryNumeric`. If the average for a cluster is greater than 0.5, the majority of the examples in that cluster have `category` equal to `state`.
- Create a new attribute in the aggregation result called `description`, based on the majority determination.
- Each cluster now has additional data and it can be joined to the original data using the cluster identifier as a key.
- Write the result to a database (I used MySQL; adjust the connection, schema and table names to your environment).
Hope this helps as a start.