Documentation

DataFort Guide


Reading and Writing Data

Load CSV with Headers

type(data_frame) :: df
call df_read_csv(df, "data.csv", .true.)  ! .true. = has headers
call df_write_console(df)

Write to CSV

call df_write_csv(df, "output.csv")

Data Exploration

Display First/Last Rows

type(data_frame) :: head_df, tail_df
head_df = df_head(df, 5)     ! First 5 rows
tail_df = df_tail(df, 10)    ! Last 10 rows
call df_write_console(head_df)

Get DataFrame Info

call df_info(df)           ! Print structure and types
call df_describe_numeric(df)  ! Statistical summary

print*, "Shape:", df_shape(df)
print*, "Rows:", df%nrows()
print*, "Cols:", df%ncols()

Access Individual Elements

Get a Single Value

real(rk) :: value_r
integer(ik) :: value_i
character(len=:), allocatable :: value_c
logical :: value_l

! Get value at row i, column j (by index)
call df_get_val_real(df, i, j, value_r)       ! For real columns
call df_get_val_integer(df, i, j, value_i)    ! For integer columns
call df_get_val_character(df, i, j, value_c)  ! For character columns
call df_get_val_logical(df, i, j, value_l)    ! For logical columns

! Example: Get value at row 5, column 2
call df_get_val_real(df, 5, 2, value_r)
print*, "Value at [5,2]:", value_r

Get Value by Column Name

character(len=:), allocatable :: col_name
real(rk) :: temperature
integer :: row_idx, col_idx

row_idx = 10
col_name = "Temperature"

! First find column index
col_idx = df%find_header_index(col_name)

! Then get the value, but only when the header lookup succeeded;
! a non-positive index is treated as "not found"
if (col_idx > 0) then
    call df_get_val_real(df, row_idx, col_idx, temperature)
    print*, "Temperature at row", row_idx, ":", temperature
else
    print*, "Column not found"
end if

Set a Single Value

! Set value at row i, column j
call df_set_val_real(df, i, j, 42.5_rk)        ! For real columns
call df_set_val_integer(df, i, j, 100_ik)      ! For integer columns
call df_set_val_character(df, i, j, "NewVal")  ! For character columns
call df_set_val_logical(df, i, j, .true.)      ! For logical columns

! Example: Set value at row 3, column 1
call df_set_val_real(df, 3, 1, 99.9_rk)

Access by Header Name

character(len=:), allocatable :: header
integer :: col_index

! Get column index from header name
header = "Age"
col_index = df%find_header_index(header)

if (col_index > 0) then
    ! Use the column index to access data
    call df_get_val_integer(df, 5, col_index, value_i)
    print*, "Age at row 5:", value_i
else
    print*, "Column not found"
end if

Data Selection and Filtering

Select Columns

type(data_frame) :: subset
integer, dimension(3) :: cols = [1, 3, 5]
subset = df_select_columns(df, cols)

Slice Rows

type(data_frame) :: sliced
sliced = df_slice_rows(df, 10, 20)  ! Rows 10 to 20

Filter by Condition

type(data_frame) :: filtered
logical, dimension(:), allocatable :: mask
real(rk) :: temperature
integer :: i

! Filter numeric range
filtered = df_filter_rows_real_range(df, 1, 20.0_rk, 30.0_rk)

! Filter by string pattern
filtered = df_filter_rows_string_pattern(df, 2, "Alice")

! Custom logical mask: one entry per row, .true. where the real value
! in column 1 is greater than 25
allocate(mask(df%nrows()))
do i = 1, df%nrows()
    call df_get_val_real(df, i, 1, temperature)
    mask(i) = temperature > 25.0_rk
end do
filtered = df_filter_rows_logical(df, mask)

Boolean Indexing (Pandas-like Filtering)

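DataFort supports pandas-style boolean indexing: comparison functions (such as gt_real, lt_integer, or eq_character) each produce a boolean mask, and masks can be combined with the logical operations mask_and, mask_or, and mask_not.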

Basic Comparisons

use datafort_boolean_indexing
type(data_frame) :: df, filtered
type(boolean_mask) :: mask

! Create comparison masks
mask = gt_real(df, "temperature", 25.0_rk)      ! temperature > 25
mask = lt_real(df, "humidity", 60.0_rk)         ! humidity < 60
mask = ge_integer(df, "count", 10_ik)           ! count >= 10
mask = le_integer(df, "age", 65_ik)             ! age <= 65
mask = eq_real(df, "price", 99.99_rk)           ! price == 99.99
mask = ne_character(df, "status", "inactive")   ! status != "inactive"
mask = contains_character(df, "name", "John")   ! name contains "John"

! Filter dataframe
filtered = df_filter(df, mask)

Combined Conditions

type(boolean_mask) :: mask1, mask2, combined

! AND: temperature > 25 AND humidity < 60
mask1 = gt_real(df, "temperature", 25.0_rk)
mask2 = lt_real(df, "humidity", 60.0_rk)
combined = mask_and(mask1, mask2)
filtered = df_filter(df, combined)

! OR: age < 18 OR age > 65
mask1 = lt_integer(df, "age", 18_ik)
mask2 = gt_integer(df, "age", 65_ik)
combined = mask_or(mask1, mask2)
filtered = df_filter(df, combined)

! NOT: NOT(status == "inactive")
mask1 = eq_character(df, "status", "inactive")
combined = mask_not(mask1)
filtered = df_filter(df, combined)

Complex Filtering Examples

! Extra declarations for these examples. They must appear before the first
! executable statement; mask1, mask2, combined and filtered are declared
! as in the previous sections.
type(boolean_mask) :: mask3
real(rk) :: q1, q3, iqr, lower, upper

! Range filter: 20 <= temperature <= 30
mask1 = ge_real(df, "temperature", 20.0_rk)
mask2 = le_real(df, "temperature", 30.0_rk)
filtered = df_filter(df, mask_and(mask1, mask2))

! Multiple conditions: (temp > 25 AND humidity < 60) OR pressure > 1013
mask1 = gt_real(df, "temperature", 25.0_rk)
mask2 = lt_real(df, "humidity", 60.0_rk)
mask3 = gt_real(df, "pressure", 1013.0_rk)
combined = mask_or(mask_and(mask1, mask2), mask3)
filtered = df_filter(df, combined)

! Exclude outliers: NOT(value < Q1 - 1.5*IQR OR value > Q3 + 1.5*IQR)
! NOTE: the quartiles are read from column 1, so this example assumes
! that column 1 is the "value" column used in the masks below.
q1 = df_percentile_real(df, 1, 25.0_rk)
q3 = df_percentile_real(df, 1, 75.0_rk)
iqr = q3 - q1
lower = q1 - 1.5_rk * iqr
upper = q3 + 1.5_rk * iqr
mask1 = lt_real(df, "value", lower)
mask2 = gt_real(df, "value", upper)
combined = mask_not(mask_or(mask1, mask2))
filtered = df_filter(df, combined)

Using Column Index

! Can use column index instead of name
mask = gt_real(df, 1, 100.0_rk)  ! Column 1 > 100
filtered = df_filter(df, mask)

Statistics and Aggregation

Basic Statistics

real(rk) :: avg, std_dev, var
integer(ik) :: total

! Column 1 statistics
avg = df_mean_real(df, 1)
std_dev = df_std_real(df, 1)
var = df_variance_real(df, 1)
total = df_sum_integer(df, 2)

print*, "Mean:", avg
print*, "Std Dev:", std_dev

Percentiles and Quantiles

real(rk) :: median, q25, q75

median = df_median_real(df, 1)
q25 = df_percentile_real(df, 1, 25.0_rk)
q75 = df_percentile_real(df, 1, 75.0_rk)

print*, "Median:", median
print*, "IQR:", q75 - q25

Correlation

real(rk) :: corr
corr = df_correlation_real(df, 1, 2)  ! Between columns 1 and 2
print*, "Correlation:", corr

Data Transformation

Normalize Data

call df_normalize_column_real(df, 1)  ! Scale to [0, 1]

Standardize Data

call df_standardize_column_real(df, 1)  ! z-score normalization

Mathematical Operations

! Apply functions to entire columns
call df_abs_column_real(df, 1)
call df_log_column(df, 1)
call df_exp_column(df, 1)
call df_sqrt_column(df, 1)
call df_pow_column(df, 1, 2.0_rk)  ! Raise to power
call df_round_column(df, 1, 2)     ! Round to 2 decimals

Cumulative and Differencing

real(rk), dimension(:), allocatable :: cumulative, differences

cumulative = df_cumsum_real(df, 1)
differences = df_diff_real(df, 1)

Sorting and Ranking

Sort by Column

call df_sort_by_column(df, 1, ascending=.true.)

Get Ranks

real(rk), dimension(:), allocatable :: ranks
ranks = df_rank_real(df, 1)

Check if Sorted

logical :: sorted
sorted = df_is_sorted_real(df, 1)

Missing Data (NaN) Handling

Check for Missing Values

logical, dimension(:), allocatable :: na_mask
na_mask = df_isna_real(df, 1)

! Count missing values
print*, "Missing values:", count(na_mask)

Fill Missing Values

call df_fillna_real(df, 1, 0.0_rk)  ! Fill with 0

Drop Missing Values

type(data_frame) :: cleaned
cleaned = df_dropna(df)

Column Operations

Add Column

real(rk), dimension(100) :: new_data
! Give the array defined values before appending it (placeholder data here).
! NOTE: presumably the array length must match the number of rows in df - confirm.
new_data = 0.0_rk
call df_append_real(df, new_data, "NewColumn")

Drop Column

call df_drop_column(df, 3)  ! Drop column 3

Rename Column

call df_rename_column(df, 1, "Temperature_C")

Reorder Columns

integer, dimension(4) :: new_order = [3, 1, 4, 2]
call df_reorder_columns(df, new_order)

Get Column Type

integer :: col_type
col_type = df%get_col_type(1)
! Returns: 1=Real, 2=Integer, 3=Logical, 4=Character, 5=Complex

Joining and Merging

Inner Join

type(data_frame) :: df1, df2, result
result = df_inner_join(df1, df2, 1, 1)  ! Join on column 1 of both

Left Join

result = df_left_join(df1, df2, 1, 1)

Right Join

result = df_right_join(df1, df2, 1, 1)

Outer Join

result = df_outer_join(df1, df2, 1, 1)

Concatenate DataFrames

type(data_frame) :: combined
integer :: axis
axis = 0  ! 0 = vertical stacking, 1 = horizontal stacking
combined = df_concat(df1, df2, axis)

Duplicates

Find Duplicates

logical, dimension(:), allocatable :: is_dup
is_dup = df_duplicated(df)  ! Check for duplicate rows

Drop Duplicates

type(data_frame) :: unique_df
unique_df = df_drop_duplicates(df)  ! Remove duplicate rows

Get Unique Values

real(rk), dimension(:), allocatable :: unique_vals
unique_vals = df_unique_real(df, 1)

Value Counts

type(data_frame) :: counts
counts = df_value_counts_real(df, 1)
call df_write_console(counts)

Row Operations

Apply Function to Row

! Define a function that operates on a row
function row_sum(row_values, num_cols) result(output)
    ! Return the sum of all values in one row.
    !
    ! row_values : the real values of the row (assumed-shape array)
    ! num_cols   : not used here, because sum() already covers the whole
    !              assumed-shape array; presumably it is part of the
    !              callback signature expected by the apply routines - confirm
    ! output     : sum of the elements of row_values
    use precision, only: rk
    implicit none
    real(rk), dimension(:), intent(in) :: row_values
    integer, intent(in) :: num_cols
    real(rk) :: output
    output = sum(row_values)
end function row_sum

! Apply to a single row
real(rk) :: result
result = df_apply_to_row_real(df, 5, row_sum)  ! Apply to row 5

Apply Function to All Rows

real(rk), dimension(:), allocatable :: results
results = df_apply_to_all_rows_real(df, row_sum)  ! Apply to all rows

Other Operations

Transpose

type(data_frame) :: transposed
transposed = df_transpose(df)

Sample Rows

type(data_frame) :: sample_df
sample_df = df_sample(df, 10)  ! Random 10 rows

Shuffle

call df_shuffle(df)  ! Randomize row order

Copy

! The variable must not share the name of the df_copy function it calls
type(data_frame) :: copied
copied = df_copy(df)

Clear

call df_clear(df)  ! Empty the dataframe

Complete Example

! End-to-end example: build a two-column dataframe from in-memory arrays,
! print its structure and statistics, filter rows by a temperature range,
! normalize the temperature column, write the result to "output.csv",
! and release both dataframes.
program datafort_example
    use datafort
    use precision
    implicit none

    type(data_frame) :: df, filtered
    real(rk), dimension(5) :: temps = [23.1_rk, 25.3_rk, 24.8_rk, 22.5_rk, 26.0_rk]
    integer(ik), dimension(5) :: ids = [1_ik, 2_ik, 3_ik, 4_ik, 5_ik]

    ! Create dataframe: column 1 = "ID", column 2 = "Temperature"
    call df%new()
    call df_append_integer(df, ids, "ID")
    call df_append_real(df, temps, "Temperature")

    ! Basic info
    call df_info(df)
    call df_describe_numeric(df)

    ! Statistics on the temperature column (column 2)
    print*, "Mean temp:", df_mean_real(df, 2)
    print*, "Max temp:", df_max_real(df, 2)

    ! Filter on the temperature column using the bounds 24.0 and 26.0
    filtered = df_filter_rows_real_range(df, 2, 24.0_rk, 26.0_rk)
    call df_write_console(filtered)

    ! Transform (applied to df itself, after the filtered copy was taken)
    call df_normalize_column_real(df, 2)

    ! Export
    call df_write_csv(df, "output.csv")

    ! Cleanup
    call df%destroy()
    call filtered%destroy()

end program datafort_example