df_drop_duplicates_subset Function

public function df_drop_duplicates_subset(df, col_indices) result(unique_df)

Drop duplicate rows based on specific columns (subset)

Returns a new data frame with duplicate rows removed, where duplicates are determined by comparing only the specified columns. The first occurrence of each set of duplicate rows is kept.

@param[in] df The data frame instance
@param[in] col_indices Array of column indices to check for duplicates
@return New data frame with duplicate rows removed

Arguments

Type Intent Optional Attributes Name
type(data_frame), intent(in) :: df
integer, intent(in) :: col_indices(:)

Return Value type(data_frame)


Source Code

    function df_drop_duplicates_subset(df, col_indices) result(unique_df)
        ! Build a copy of df in which rows that repeat an earlier row are removed.
        ! Two rows count as duplicates when they agree on every column listed in
        ! col_indices; the first row of each duplicate set is the one retained.
        !
        ! Arguments:
        !   df          - source data frame (not modified)
        !   col_indices - indices of the columns used to decide whether two rows
        !                 are duplicates; all columns of df are still copied to
        !                 the result
        ! Result:
        !   unique_df   - new data frame holding only the retained rows
        !
        ! Comparison rules, per column type:
        !   real / integer - equal values match; two missing values (as reported
        !                    by ieee_is_nan / is_nan_integer) also match each other
        !   logical        - compared with .eqv.
        !   character      - compared with ==
        !   complex        - compared with ==
        ! Columns whose dtype is none of the above are ignored in the comparison
        ! and are not copied to the result.
        ! If col_indices is empty no column can tell rows apart, so every row after
        ! the first is treated as a duplicate.
        type(data_frame), intent(in) :: df
        integer, intent(in) :: col_indices(:)
        type(data_frame) :: unique_df
        logical, dimension(:), allocatable :: is_dup, matches
        integer, dimension(:), allocatable :: unique_indices
        integer :: i, k, row1, row2, dtype, n_rows, num_unique, col_idx
        logical :: has_headers
        character(len=:), allocatable :: header_name
        real(rk), dimension(:), allocatable :: real_col, real_unique_col
        integer(ik), dimension(:), allocatable :: int_col, int_unique_col
        logical, dimension(:), allocatable :: log_col, log_unique_col
        character(len=:), allocatable :: char_col(:), char_unique_col(:)
        complex(rk), dimension(:), allocatable :: cmplx_col, cmplx_unique_col

        n_rows = df % nrows()
        allocate (is_dup(n_rows), matches(n_rows))
        is_dup = .false.

        ! For each retained row (row1), find every later row that duplicates it.
        ! matches(row2) stays .true. only while row2 agrees with row1 on all subset
        ! columns examined so far.  Each column is fetched once per row1 instead of
        ! once per (row1, row2) pair.
        do row1 = 1, n_rows
            if (is_dup(row1)) cycle

            ! Candidates are the later rows not already flagged as duplicates.
            matches = .false.
            matches(row1 + 1:) = .not. is_dup(row1 + 1:)

            do k = 1, size(col_indices)
                ! Nothing left to compare: skip fetching the remaining columns.
                if (.not. any(matches)) exit

                col_idx = col_indices(k)
                dtype = df % dtype(col_idx)

                select case (dtype)
                case (REAL_NUM)
                    real_col = df_get_col_real(df, col_idx)
                    do row2 = row1 + 1, n_rows
                        if (.not. matches(row2)) cycle
                        if (ieee_is_nan(real_col(row1)) .or. ieee_is_nan(real_col(row2))) then
                            ! A NaN only matches another NaN.
                            matches(row2) = ieee_is_nan(real_col(row1)) .and. ieee_is_nan(real_col(row2))
                        else
                            matches(row2) = real_col(row1) == real_col(row2)
                        end if
                    end do

                case (INTEGER_NUM)
                    int_col = df_get_col_integer(df, col_idx)
                    do row2 = row1 + 1, n_rows
                        if (.not. matches(row2)) cycle
                        if (is_nan_integer(int_col(row1)) .or. is_nan_integer(int_col(row2))) then
                            ! A missing integer only matches another missing integer.
                            matches(row2) = is_nan_integer(int_col(row1)) .and. is_nan_integer(int_col(row2))
                        else
                            matches(row2) = int_col(row1) == int_col(row2)
                        end if
                    end do

                case (LOGICAL_NUM)
                    log_col = df_get_col_logical(df, col_idx)
                    do row2 = row1 + 1, n_rows
                        if (matches(row2)) matches(row2) = log_col(row1) .eqv. log_col(row2)
                    end do

                case (CHARACTER_NUM)
                    char_col = df_get_col_character(df, col_idx)
                    do row2 = row1 + 1, n_rows
                        if (matches(row2)) matches(row2) = char_col(row1) == char_col(row2)
                    end do

                case (COMPLEX_NUM)
                    cmplx_col = df_get_col_complex(df, col_idx)
                    do row2 = row1 + 1, n_rows
                        if (matches(row2)) matches(row2) = cmplx_col(row1) == cmplx_col(row2)
                    end do
                end select
            end do

            ! Whatever still matches row1 on every subset column is a duplicate.
            is_dup = is_dup .or. matches
        end do

        ! Row numbers of the rows to keep, in their original order.
        unique_indices = pack([(i, i=1, n_rows)], .not. is_dup)
        num_unique = size(unique_indices)

        ! Build the result data frame, copying the kept rows of every column.
        call unique_df % new(df % get_max_char_len())
        has_headers = df % get_with_headers()

        do i = 1, df % ncols()
            dtype = df % dtype(i)

            select case (dtype)
            case (REAL_NUM)
                real_col = df_get_col_real(df, i)
                real_unique_col = real_col(unique_indices)
                if (has_headers) then
                    header_name = df % header(i)
                    call df_append_real(unique_df, real_unique_col, trim(header_name))
                else
                    call df_append_real(unique_df, real_unique_col)
                end if

            case (INTEGER_NUM)
                int_col = df_get_col_integer(df, i)
                int_unique_col = int_col(unique_indices)
                if (has_headers) then
                    header_name = df % header(i)
                    call df_append_integer(unique_df, int_unique_col, trim(header_name))
                else
                    call df_append_integer(unique_df, int_unique_col)
                end if

            case (LOGICAL_NUM)
                log_col = df_get_col_logical(df, i)
                log_unique_col = log_col(unique_indices)
                if (has_headers) then
                    header_name = df % header(i)
                    call df_append_logical(unique_df, log_unique_col, trim(header_name))
                else
                    call df_append_logical(unique_df, log_unique_col)
                end if

            case (CHARACTER_NUM)
                char_col = df_get_col_character(df, i)
                ! Allocate explicitly so the element length follows the source column.
                allocate (character(len=len(char_col)) :: char_unique_col(num_unique))
                char_unique_col(:) = char_col(unique_indices)
                if (has_headers) then
                    header_name = df % header(i)
                    call df_append_character(unique_df, char_unique_col, trim(header_name))
                else
                    call df_append_character(unique_df, char_unique_col)
                end if
                deallocate (char_unique_col)

            case (COMPLEX_NUM)
                cmplx_col = df_get_col_complex(df, i)
                cmplx_unique_col = cmplx_col(unique_indices)
                if (has_headers) then
                    header_name = df % header(i)
                    call df_append_complex(unique_df, cmplx_unique_col, trim(header_name))
                else
                    call df_append_complex(unique_df, cmplx_unique_col)
                end if
            end select
        end do

        ! Local allocatables are released automatically on return.
    end function df_drop_duplicates_subset