1

Sorry, I'm totally new to RapidMiner and have only done the basic tutorial.

I have a dataset like

MatchID   Value1   Value2   Value3
1            5        1        2
1           4.5      1.5       2
...

and would like to know whether there is a way to get the highest value per column (for example Value1) and make further calculations with it (generate attributes).

Thank you.

neodymium
  • 67
  • 7

3 Answers

3

There are lots of ways as it happens. Here's one using the Aggregate operator to find the maxima, Join to join this to the original and Generate Attributes to do some calculating.

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: compute maxima with Aggregate, join them back onto the
  original rows, then derive per-row differences from those maxima.
  Operator chain: Retrieve Iris, Aggregate, Join, Generate Attributes, result 1.
-->
<process version="7.2.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.2.003" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the Iris sample data set from the repository. -->
      <operator name="Retrieve Iris" class="retrieve" activated="true"
                compatibility="7.2.003" expanded="true"
                height="68" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <!--
        Default aggregation is switched on with the function "maximum";
        no attribute-specific aggregations are listed.
      -->
      <operator name="Aggregate" class="aggregate" activated="true"
                compatibility="7.2.003" expanded="true"
                height="82" width="90" x="179" y="34">
        <parameter key="use_default_aggregation" value="true"/>
        <parameter key="default_aggregation_function" value="maximum"/>
        <list key="aggregation_attributes"/>
      </operator>
      <!--
        Outer join with an empty key attribute list and the id attribute not
        used as key. Left input is the aggregated output, right input is the
        original data passed through by Aggregate.
        NOTE(review): confirm how Join pairs rows when no key attributes are set.
      -->
      <operator name="Join" class="join" activated="true"
                compatibility="7.2.003" expanded="true"
                height="82" width="90" x="313" y="34">
        <parameter key="join_type" value="outer"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes"/>
      </operator>
      <!--
        Each deltaAN is [maximum(aN)] minus aN for N = 1..4.
        Presumably the maximum(aN) attributes are the ones produced by Aggregate.
      -->
      <operator name="Generate Attributes" class="generate_attributes" activated="true"
                compatibility="7.2.003" expanded="true"
                height="82" width="90" x="447" y="34">
        <list key="function_descriptions">
          <parameter key="deltaA1" value="[maximum(a1)]-a1"/>
          <parameter key="deltaA2" value="[maximum(a2)]-a2"/>
          <parameter key="deltaA3" value="[maximum(a3)]-a3"/>
          <parameter key="deltaA4" value="[maximum(a4)]-a4"/>
        </list>
      </operator>
      <!-- Wiring between the operators and the process result port. -->
      <connect from_op="Retrieve Iris" from_port="output"
               to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output"
               to_op="Join" to_port="left"/>
      <connect from_op="Aggregate" from_port="original"
               to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join"
               to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output"
               to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Andrew Chisholm
  • 6,362
  • 2
  • 22
  • 41
0

Another way is to use the Extract Macro operator with the statistics setting max. This stores the maximum for a given attribute as a macro value, which then can be used, e.g. in Generate Attributes.

The advantage is that you don't modify the original dataset and don't have to use a join or multiply operator.

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: store the maximum of attribute a1 in a macro and use that
  macro in Generate Attributes.
  Operator chain: Retrieve Iris, Extract Macro, Generate Attributes, result 1.
-->
<process version="7.5.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the Iris sample data set from the repository. -->
      <operator name="Retrieve Iris" class="retrieve" activated="true"
                compatibility="7.5.000" expanded="true"
                height="68" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <!-- Macro "maxA1" is filled from the "max" statistic of attribute a1. -->
      <operator name="Extract Macro" class="extract_macro" activated="true"
                compatibility="7.5.000" expanded="true"
                height="68" width="90" x="179" y="34">
        <parameter key="macro" value="maxA1"/>
        <parameter key="macro_type" value="statistics"/>
        <parameter key="statistics" value="max"/>
        <parameter key="attribute_name" value="a1"/>
        <list key="additional_macros"/>
        <description align="center" color="transparent" colored="false" width="126">extract maximum of attribute a1 and store it in a macro</description>
      </operator>
      <!-- DifferenceA1 is the parsed macro value minus a1; the macro is text, hence parse(). -->
      <operator name="Generate Attributes" class="generate_attributes" activated="true"
                compatibility="7.5.000" expanded="true"
                height="82" width="90" x="313" y="34">
        <list key="function_descriptions">
          <parameter key="DifferenceA1" value="parse(%{maxA1})-a1"/>
        </list>
        <description align="center" color="transparent" colored="false" width="126">calculate the difference of a1 from the maximum using the macro value</description>
      </operator>
      <!-- Wiring between the operators and the process result port. -->
      <connect from_op="Retrieve Iris" from_port="output"
               to_op="Extract Macro" to_port="example set"/>
      <connect from_op="Extract Macro" from_port="example set"
               to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output"
               to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Hint: since macro values are stored as text, you first have to parse them to use their numerical value.

Christian König
  • 3,437
  • 16
  • 28
0

A third option is to Sort the example set and then keep only the example with the maximum value, using a Filter Example Range operator. This comes in handy if you are mostly interested in the values of the other attributes in the example where a certain attribute is maximal.

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: sort by a1 in decreasing order and keep only the first
  example, i.e. the one holding the maximum of a1.
  Operator chain: Retrieve Iris, Sort, Filter Example Range, result 1.
-->
<process version="7.5.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the Iris sample data set from the repository. -->
      <operator name="Retrieve Iris" class="retrieve" activated="true"
                compatibility="7.5.000" expanded="true"
                height="68" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <!-- Order the examples by attribute a1, largest value first. -->
      <operator name="Sort" class="sort" activated="true"
                compatibility="7.5.000" expanded="true"
                height="82" width="90" x="179" y="34">
        <parameter key="attribute_name" value="a1"/>
        <parameter key="sorting_direction" value="decreasing"/>
        <description align="center" color="transparent" colored="false" width="126">sorting the example set on a1 decreasing</description>
      </operator>
      <!-- Keep the range from example 1 to example 1, i.e. a single row. -->
      <operator name="Filter Example Range" class="filter_example_range" activated="true"
                compatibility="7.5.000" expanded="true"
                height="82" width="90" x="313" y="34">
        <parameter key="first_example" value="1"/>
        <parameter key="last_example" value="1"/>
        <description align="center" color="transparent" colored="false" width="126">only keeping the first example, which has the maximum for a1</description>
      </operator>
      <!-- Wiring between the operators and the process result port. -->
      <connect from_op="Retrieve Iris" from_port="output"
               to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output"
               to_op="Filter Example Range" to_port="example set input"/>
      <connect from_op="Filter Example Range" from_port="example set output"
               to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Christian König
  • 3,437
  • 16
  • 28