Drop duplicate rows based on specific columns (subset)
Returns new data frame with duplicate rows removed, where duplicates are determined by comparing only the specified columns. Keeps first occurrence.
@param[in] df The data frame instance
@param[in] col_indices Array of column indices to check for duplicates
@return New data frame with duplicate rows removed
Type | Intent | Optional | Attributes | | Name
---|---|---|---|---|---
type(data_frame), | intent(in) | | | :: | df
integer, | intent(in) | | | :: | col_indices(:)
!> Drop duplicate rows, comparing only a subset of columns.
!!
!! Two rows are duplicates when they agree on every column listed in
!! `col_indices`; the first occurrence of each distinct row is kept and all
!! columns of `df` are copied to the result. Real entries that are both NaN
!! (`ieee_is_nan`) and integer entries that are both flagged by
!! `is_nan_integer` compare as equal. When `col_indices` is empty every row
!! trivially agrees with the first one, so only the first row is kept.
!!
!! @param[in] df The data frame instance
!! @param[in] col_indices Array of column indices to check for duplicates
!! @return New data frame with duplicate rows removed
function df_drop_duplicates_subset(df, col_indices) result(unique_df)
    type(data_frame), intent(in) :: df
    integer, intent(in) :: col_indices(:)
    type(data_frame) :: unique_df

    integer, dimension(:), allocatable :: first_match, refined, row_ids, unique_indices
    integer :: i, j, k, n_rows, dtype, col_idx
    logical :: with_headers
    character(len=:), allocatable :: header_name
    real(rk), dimension(:), allocatable :: real_col, real_unique_col
    integer(ik), dimension(:), allocatable :: int_col, int_unique_col
    logical, dimension(:), allocatable :: log_col, log_unique_col
    character(len=:), allocatable :: char_col(:), char_unique_col(:)
    complex(rk), dimension(:), allocatable :: cmplx_col, cmplx_unique_col

    n_rows = df % nrows()

    ! first_match(i) is the lowest-numbered row that agrees with row i on
    ! every subset column processed so far. Before any column has been
    ! looked at, all rows agree with row 1.
    allocate (first_match(n_rows), refined(n_rows))
    first_match = 1

    if (n_rows > 1) then
        subset_columns: do k = 1, size(col_indices)
            col_idx = col_indices(k)
            dtype = df % dtype(col_idx)

            ! Load the column once per subset entry instead of once per
            ! compared row pair.
            select case (dtype)
            case (REAL_NUM)
                real_col = df_get_col_real(df, col_idx)
            case (INTEGER_NUM)
                int_col = df_get_col_integer(df, col_idx)
            case (LOGICAL_NUM)
                log_col = df_get_col_logical(df, col_idx)
            case (CHARACTER_NUM)
                char_col = df_get_col_character(df, col_idx)
            case (COMPLEX_NUM)
                cmplx_col = df_get_col_complex(df, col_idx)
            case default
                ! Unrecognised type codes do not take part in the comparison.
                cycle subset_columns
            end select

            ! Split the existing groups further by this column's values.
            do i = 1, n_rows
                refined(i) = i
                do j = 1, i - 1
                    ! Only the first row of a refined group can be the
                    ! earliest match, so later members are skipped.
                    if (refined(j) /= j) cycle
                    ! Rows that already differed on an earlier column can
                    ! never become duplicates again.
                    if (first_match(j) /= first_match(i)) cycle
                    if (values_match(j, i)) then
                        refined(i) = j
                        exit
                    end if
                end do
            end do
            first_match = refined
        end do subset_columns
    end if

    ! A row is kept exactly when it is the first member of its group.
    row_ids = [(i, i = 1, n_rows)]
    unique_indices = pack(row_ids, first_match == row_ids)

    ! Build the result data frame with all columns, restricted to kept rows.
    call unique_df % new(df % get_max_char_len())
    with_headers = df % get_with_headers()

    do i = 1, df % ncols()
        dtype = df % dtype(i)

        select case (dtype)
        case (REAL_NUM)
            real_col = df_get_col_real(df, i)
            real_unique_col = real_col(unique_indices)
            if (with_headers) then
                header_name = df % header(i)
                call df_append_real(unique_df, real_unique_col, trim(header_name))
            else
                call df_append_real(unique_df, real_unique_col)
            end if
        case (INTEGER_NUM)
            int_col = df_get_col_integer(df, i)
            int_unique_col = int_col(unique_indices)
            if (with_headers) then
                header_name = df % header(i)
                call df_append_integer(unique_df, int_unique_col, trim(header_name))
            else
                call df_append_integer(unique_df, int_unique_col)
            end if
        case (LOGICAL_NUM)
            log_col = df_get_col_logical(df, i)
            log_unique_col = log_col(unique_indices)
            if (with_headers) then
                header_name = df % header(i)
                call df_append_logical(unique_df, log_unique_col, trim(header_name))
            else
                call df_append_logical(unique_df, log_unique_col)
            end if
        case (CHARACTER_NUM)
            char_col = df_get_col_character(df, i)
            char_unique_col = char_col(unique_indices)
            if (with_headers) then
                header_name = df % header(i)
                call df_append_character(unique_df, char_unique_col, trim(header_name))
            else
                call df_append_character(unique_df, char_unique_col)
            end if
        case (COMPLEX_NUM)
            cmplx_col = df_get_col_complex(df, i)
            cmplx_unique_col = cmplx_col(unique_indices)
            if (with_headers) then
                header_name = df % header(i)
                call df_append_complex(unique_df, cmplx_unique_col, trim(header_name))
            else
                call df_append_complex(unique_df, cmplx_unique_col)
            end if
        end select
    end do

contains

    !> True when rows `a` and `b` hold equal values in the subset column that
    !! is currently loaded (selected by the host's `dtype`).
    function values_match(a, b) result(same)
        integer, intent(in) :: a, b
        logical :: same
        logical :: nan_a, nan_b

        same = .true.
        select case (dtype)
        case (REAL_NUM)
            nan_a = ieee_is_nan(real_col(a))
            nan_b = ieee_is_nan(real_col(b))
            if (nan_a .or. nan_b) then
                ! Two NaNs count as equal; NaN versus a number does not.
                same = nan_a .and. nan_b
            else
                same = real_col(a) == real_col(b)
            end if
        case (INTEGER_NUM)
            nan_a = is_nan_integer(int_col(a))
            nan_b = is_nan_integer(int_col(b))
            if (nan_a .or. nan_b) then
                same = nan_a .and. nan_b
            else
                same = int_col(a) == int_col(b)
            end if
        case (LOGICAL_NUM)
            same = log_col(a) .eqv. log_col(b)
        case (CHARACTER_NUM)
            same = char_col(a) == char_col(b)
        case (COMPLEX_NUM)
            ! NOTE(review): unlike the real case there is no NaN handling
            ! here, so complex entries containing NaN never match - confirm
            ! whether that asymmetry is intended.
            same = cmplx_col(a) == cmplx_col(b)
        end select
    end function values_match

end function df_drop_duplicates_subset