1

I have rapidminer example set like this,

ID   Issue       Exp  
100  9/8/2020    11/8/2020
100  8/5/2019    9/5/2019
101  6/3/2020    10/1/2020
102  8/15/2020   12/12/2020

I want to add a new column which will count the occurrence of the ID by adding the numbers and sort by the earliest date so we know at what date how many count I had.

Output like this,

ID   Issue       Exp         Count  
100  8/5/2019    9/5/2019    1
100  9/8/2020    11/8/2020   2
101  6/3/2020    10/1/2020   1
102  8/15/2020   12/12/2020  1

But when I aggregate by ID and do a count it will just count the total instead and show them for the same ID. So, for ID 100 it shows me 2 both the times because it is just adding the numbers both the times.

For example, for ID 100 in 2019 we had only 1 issue date hence count is 1, when we find ID 100 again at 2020 the count will be 2. So, the sort by date is also important because it will help us find the ID occurrence in correct order.

Any help is appreciated.

Thanks.

Tahsin Alam
  • 147
  • 1
  • 10

1 Answers1

2

One approach is to use the Loop Values operator to loop through all the possible values of the ID operator, use this value to filter the example set (which has already been sorted), generate a new incrementing id from this filtered set and finally append all the filtered examples back together.

Here's the process and corresponding XML to do this.

enter image description here

<?xml version="1.0" encoding="UTF-8"?><process version="9.9.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.9.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="9.9.000" expanded="true" height="68" name="Retrieve occById" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Local Repository/data/occById"/>
      </operator>
      <operator activated="true" class="blending:sort" compatibility="9.9.000" expanded="true" height="82" name="Sort" width="90" x="179" y="34">
        <list key="sort_by">
          <parameter key="ID" value="ascending"/>
          <parameter key="Issue" value="ascending"/>
        </list>
      </operator>
      <operator activated="true" class="concurrency:loop_values" compatibility="9.9.000" expanded="true" height="82" name="Loop Values" width="90" x="313" y="34">
        <parameter key="attribute" value="ID"/>
        <parameter key="iteration_macro" value="loop_value"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="filter_examples" compatibility="9.9.000" expanded="true" height="103" name="Filter Examples" width="90" x="112" y="34">
            <parameter key="parameter_string" value="ID=%{loop_value}"/>
            <parameter key="parameter_expression" value=""/>
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="invert_filter" value="false"/>
            <list key="filters_list">
              <parameter key="filters_entry_key" value="ID.eq.%{loop_value}"/>
            </list>
            <parameter key="filters_logic_and" value="true"/>
            <parameter key="filters_check_metadata" value="true"/>
          </operator>
          <operator activated="true" class="generate_id" compatibility="9.9.000" expanded="true" height="82" name="Generate ID" width="90" x="313" y="34">
            <parameter key="create_nominal_ids" value="false"/>
            <parameter key="offset" value="0"/>
          </operator>
          <connect from_port="input 1" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="append" compatibility="9.9.000" expanded="true" height="82" name="Append" width="90" x="447" y="34">
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="merge_type" value="all"/>
      </operator>
      <connect from_op="Retrieve occById" from_port="output" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_op="Loop Values" to_port="input 1"/>
      <connect from_op="Loop Values" from_port="output 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

The input data was handcrafted and stored with name occById in the local repository - it looks like this.

enter image description here

The result is below.

enter image description here

Andrew Chisholm
  • 6,362
  • 2
  • 22
  • 41