df_sample Function

public function df_sample(df, n, seed) result(sampled_df)

Get n randomly chosen rows from the data frame, sampled without replacement

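@param[in] df The data frame to sample from
@param[in] n Number of rows to sample (capped at the number of rows in df)
@param[in] seed Optional random seed; when present, it is used to reseed the intrinsic random number generator before sampling
@return New data frame with sampled rows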

Arguments

Type | Intent | Optional | Attributes | Name
type(data_frame), intent(in) :: df
integer, intent(in) :: n
integer, intent(in), optional :: seed

Return Value type(data_frame)


Source Code

    !> Draw up to `n` distinct rows at random from a data frame.
    !!
    !! The row indices 1..nrows are shuffled with a Fisher-Yates pass and the
    !! first `min(n, nrows)` shuffled indices select the rows that are copied,
    !! column by column, into the result. Rows are therefore sampled without
    !! replacement.
    !!
    !! @param[in] df   The data frame to sample from.
    !! @param[in] n    Number of rows requested. Values larger than the number
    !!                 of rows in `df` are capped; zero or negative values
    !!                 yield a freshly initialised frame with no columns.
    !! @param[in] seed Optional seed. When present, every element of the
    !!                 intrinsic generator's seed array is set to this value
    !!                 before sampling. NOTE: this reseeds the intrinsic
    !!                 generator itself, so later `random_number` calls made
    !!                 by the caller are affected as well.
    !! @return New data frame holding the sampled rows.
    function df_sample(df, n, seed) result(sampled_df)
        type(data_frame), intent(in) :: df
        integer, intent(in) :: n
        integer, intent(in), optional :: seed
        type(data_frame) :: sampled_df

        integer, dimension(:), allocatable :: indices, selected_indices, seed_array
        integer :: i, j, temp, total_rows, num_samples, seed_size
        real :: rand_val

        ! Query the row count once; df is intent(in) so it cannot change.
        total_rows = df % nrows()
        num_samples = min(n, total_rows)

        ! Nothing to draw (empty frame, n == 0, or a negative request):
        ! hand back an initialised but empty frame without shuffling.
        if (num_samples <= 0) then
            call sampled_df % new(df % get_max_char_len())
            return
        end if

        ! Initialize random seed if provided. The seed array length is
        ! processor dependent, so it is queried first and then filled with
        ! the scalar seed.
        if (present(seed)) then
            call random_seed(size=seed_size)
            allocate (seed_array(seed_size))
            seed_array = seed
            call random_seed(put=seed_array)
        end if

        ! Create array of all indices
        allocate (indices(total_rows))
        do i = 1, total_rows
            indices(i) = i
        end do

        ! Fisher-Yates shuffle to get random sample
        do i = total_rows, 2, -1
            call random_number(rand_val)
            ! rand_val lies in [0, 1), but the default-real product can round
            ! up to i for very large i; clamp so j never exceeds i and the
            ! swap below stays within bounds.
            j = min(int(rand_val * i) + 1, i)
            temp = indices(i)
            indices(i) = indices(j)
            indices(j) = temp
        end do

        ! Take first num_samples shuffled indices
        allocate (selected_indices(num_samples))
        selected_indices = indices(1:num_samples)

        ! Create sampled dataframe: same maximum character length as the
        ! source, then copy the selected rows of every column.
        call sampled_df % new(df % get_max_char_len())
        do i = 1, df % ncols()
            call copy_filtered_column(df, sampled_df, i, selected_indices)
        end do

        ! Local allocatables are released automatically on return.
    end function df_sample