diff --git a/.gitignore b/.gitignore index e793e106..e857fd6a 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ compile_commands.json /doc/ /Meta/ .claude/settings.local.json +.claude/plans/* diff --git a/R/read_fwf.R b/R/read_fwf.R index 3cb64072..a22bb907 100644 --- a/R/read_fwf.R +++ b/R/read_fwf.R @@ -1,29 +1,67 @@ -#' Read a fixed width file into a tibble +#' Read a fixed-width file into a tibble #' -#' A fixed width file can be a very compact representation of numeric data. -#' It's also very fast to parse, because every field is in the same place in -#' every line. Unfortunately, it's painful to parse because you need to -#' describe the length of every field. Readr aims to make it as easy as possible -#' by providing a number of different ways to describe the field structure. -#' - [fwf_empty()] - Guesses based on the positions of empty columns. -#' - [fwf_widths()] - Supply the widths of the columns. -#' - [fwf_positions()] - Supply paired vectors of start and end positions. -#' - [fwf_cols()] - Supply named arguments of paired start and end positions or column widths. +#' @description +#' Fixed-width files store tabular data with each field occupying a specific +#' range of character positions in every line. Once the fields are identified, +#' converting them to the appropriate R types works just like for delimited +#' files. The unique challenge with fixed-width files is describing where each +#' field begins and ends. \pkg{readr} tries to ease this pain by offering a +#' few different ways to specify the field structure: +#' - `fwf_empty()` - Guesses based on the positions of empty columns. This is +#' the default. (Note that `fwf_empty()` returns 0-based positions, for +#' internal use.) +#' - `fwf_widths()` - Supply the widths of the columns. +#' - `fwf_positions()` - Supply paired vectors of start and end positions. These +#' are interpreted as 1-based positions, so are off-by-one compared to the +#' output of `fwf_empty()`. 
+#' - `fwf_cols()` - Supply named arguments of paired start and end positions or +#' column widths. +#' +#' Note: `fwf_empty()` cannot work with a connection or with any of the input +#' types that involve a connection internally, which includes remote and +#' compressed files. The reason is that this would necessitate reading from the +#' connection twice. In these cases, you'll have to either provide the field +#' structure explicitly with another `fwf_*()` function or download (and +#' decompress, if relevant) the file first. +#' +#' @details +#' Here's an enhanced example using the contents of the file accessed via +#' `readr_example("fwf-sample.txt")`. +#' +#' ``` +#' 1 2 3 4 +#' 123456789012345678901234567890123456789012 +#' [ name 20 ][state 10][ ssn 12 ] +#' John Smith WA 418-Y11-4111 +#' Mary Hartford CA 319-Z19-4341 +#' Evan Nolan IL 219-532-c301 +#' ``` +#' +#' Here are some valid field specifications for the above (they aren't all +#' equivalent! but they are all valid): +#' +#' ``` +#' fwf_widths(c(20, 10, 12), c("name", "state", "ssn")) +#' fwf_positions(c(1, 30), c(20, 42), c("name", "ssn")) +#' fwf_cols(state = c(21, 30), last = c(6, 20), first = c(1, 4), ssn = c(31, 42)) +#' fwf_cols(name = c(1, 20), ssn = c(30, 42)) +#' fwf_cols(name = 20, state = 10, ssn = 12) +#' ``` #' #' @seealso [read_table()] to read fixed width files where each #' column is separated by whitespace. #' #' @section Second edition changes: -#' Comments are no longer looked for anywhere in the file. -#' They are now only ignored at the start of a line. +#' Comments are now only ignored if they appear at the start of a line. +#' Comments elsewhere in a line are no longer treated specially. #' #' @inheritParams datasource #' @inheritParams tokenizer_fwf #' @inheritParams read_delim #' @param col_positions Column positions, as created by [fwf_empty()], -#' [fwf_widths()] or [fwf_positions()]. To read in only selected fields, -#' use [fwf_positions()]. 
If the width of the last column is variable (a -#' ragged fwf file), supply the last end position as NA. +#' `fwf_widths()`, `fwf_positions()`, or `fwf_cols()`. To read in only +#' selected fields, use `fwf_positions()`. If the width of the last column +#' is variable (a ragged fwf file), supply the last end position as `NA`. #' @export #' @examples #' fwf_sample <- readr_example("fwf-sample.txt") @@ -181,8 +219,8 @@ fwf_empty <- function( #' @rdname read_fwf #' @export -#' @param widths Width of each field. Use NA as width of last field when -#' reading a ragged fwf file. +#' @param widths Width of each field. Use `NA` as the width of the last field +#' when reading a ragged fixed-width file. #' @param col_names Either NULL, or a character vector column names. fwf_widths <- function(widths, col_names = NULL) { if (edition_first()) { @@ -195,7 +233,9 @@ fwf_widths <- function(widths, col_names = NULL) { #' @rdname read_fwf #' @export #' @param start,end Starting and ending (inclusive) positions of each field. -#' Use NA as last end field when reading a ragged fwf file. +#' **Positions are 1-based**: the first character in a line is at position 1. +#' Use `NA` as the last value of `end` when reading a ragged fixed-width +#' file. fwf_positions <- function(start, end = NULL, col_names = NULL) { if (edition_first()) { stopifnot(length(start) == length(end)) diff --git a/R/source.R b/R/source.R index 26e10ba7..a8b30c3b 100644 --- a/R/source.R +++ b/R/source.R @@ -6,7 +6,7 @@ #' Files ending in `.gz`, `.bz2`, `.xz`, or `.zip` will #' be automatically uncompressed. Files starting with `http://`, #' `https://`, `ftp://`, or `ftps://` will be automatically -#' downloaded. Remote gz files can also be automatically downloaded and +#' downloaded. Remote `.gz` files can also be automatically downloaded and #' decompressed. #' #' Literal data is most useful for examples and tests. 
To be recognised as diff --git a/man/count_fields.Rd b/man/count_fields.Rd index eec16fd8..fc8793d8 100644 --- a/man/count_fields.Rd +++ b/man/count_fields.Rd @@ -13,7 +13,7 @@ count_fields(file, tokenizer, skip = 0, n_max = -1L) Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/datasource.Rd b/man/datasource.Rd index 0ff061a2..8fc2db13 100644 --- a/man/datasource.Rd +++ b/man/datasource.Rd @@ -19,7 +19,7 @@ datasource( Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/read_delim.Rd b/man/read_delim.Rd index 70e0c141..b6a033b3 100644 --- a/man/read_delim.Rd +++ b/man/read_delim.Rd @@ -109,7 +109,7 @@ read_tsv( Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. 
To be recognised as diff --git a/man/read_delim_chunked.Rd b/man/read_delim_chunked.Rd index 4c3d4c45..358113f3 100644 --- a/man/read_delim_chunked.Rd +++ b/man/read_delim_chunked.Rd @@ -93,7 +93,7 @@ read_tsv_chunked( Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/read_file.Rd b/man/read_file.Rd index 4baba11d..6fa9f38c 100644 --- a/man/read_file.Rd +++ b/man/read_file.Rd @@ -19,7 +19,7 @@ write_file(x, file, append = FALSE) Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/read_fwf.Rd b/man/read_fwf.Rd index ae745e21..7544a21a 100644 --- a/man/read_fwf.Rd +++ b/man/read_fwf.Rd @@ -6,7 +6,7 @@ \alias{fwf_widths} \alias{fwf_positions} \alias{fwf_cols} -\title{Read a fixed width file into a tibble} +\title{Read a fixed-width file into a tibble} \usage{ read_fwf( file, @@ -51,7 +51,7 @@ fwf_cols(...) Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. 
Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as @@ -62,9 +62,9 @@ string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{col_positions}{Column positions, as created by \code{\link[=fwf_empty]{fwf_empty()}}, -\code{\link[=fwf_widths]{fwf_widths()}} or \code{\link[=fwf_positions]{fwf_positions()}}. To read in only selected fields, -use \code{\link[=fwf_positions]{fwf_positions()}}. If the width of the last column is variable (a -ragged fwf file), supply the last end position as NA.} +\code{fwf_widths()}, \code{fwf_positions()}, or \code{fwf_cols()}. To read in only +selected fields, use \code{fwf_positions()}. If the width of the last column +is variable (a ragged fwf file), supply the last end position as \code{NA}.} \item{col_types}{One of \code{NULL}, a \code{\link[=cols]{cols()}} specification, or a string. See \code{vignette("readr")} for more details. @@ -187,11 +187,13 @@ option is \code{TRUE} then blank rows will not be represented at all. If it is \item{n}{Number of lines the tokenizer will read to determine file structure. By default it is set to 100.} -\item{widths}{Width of each field. Use NA as width of last field when -reading a ragged fwf file.} +\item{widths}{Width of each field. Use \code{NA} as the width of the last field +when reading a ragged fixed-width file.} \item{start, end}{Starting and ending (inclusive) positions of each field. -Use NA as last end field when reading a ragged fwf file.} +\strong{Positions are 1-based}: the first character in a line is at position 1. +Use \code{NA} as the last value of \code{end} when reading a ragged fixed-width +file.} \item{...}{If the first element is a data frame, then it must have all numeric columns and either one or two rows. @@ -201,22 +203,57 @@ positions. 
The elements of \code{...} are used to construct a data frame with or or two rows as above.} } \description{ -A fixed width file can be a very compact representation of numeric data. -It's also very fast to parse, because every field is in the same place in -every line. Unfortunately, it's painful to parse because you need to -describe the length of every field. Readr aims to make it as easy as possible -by providing a number of different ways to describe the field structure. +Fixed-width files store tabular data with each field occupying a specific +range of character positions in every line. Once the fields are identified, +converting them to the appropriate R types works just like for delimited +files. The unique challenge with fixed-width files is describing where each +field begins and ends. \pkg{readr} tries to ease this pain by offering a +few different ways to specify the field structure: \itemize{ -\item \code{\link[=fwf_empty]{fwf_empty()}} - Guesses based on the positions of empty columns. -\item \code{\link[=fwf_widths]{fwf_widths()}} - Supply the widths of the columns. -\item \code{\link[=fwf_positions]{fwf_positions()}} - Supply paired vectors of start and end positions. -\item \code{\link[=fwf_cols]{fwf_cols()}} - Supply named arguments of paired start and end positions or column widths. +\item \code{fwf_empty()} - Guesses based on the positions of empty columns. This is +the default. (Note that \code{fwf_empty()} returns 0-based positions, for +internal use.) +\item \code{fwf_widths()} - Supply the widths of the columns. +\item \code{fwf_positions()} - Supply paired vectors of start and end positions. These +are interpreted as 1-based positions, so are off-by-one compared to the +output of \code{fwf_empty()}. +\item \code{fwf_cols()} - Supply named arguments of paired start and end positions or +column widths. 
} + +Note: \code{fwf_empty()} cannot work with a connection or with any of the input +types that involve a connection internally, which includes remote and +compressed files. The reason is that this would necessitate reading from the +connection twice. In these cases, you'll have to either provide the field +structure explicitly with another \verb{fwf_*()} function or download (and +decompress, if relevant) the file first. +} +\details{ +Here's an enhanced example using the contents of the file accessed via +\code{readr_example("fwf-sample.txt")}. + +\if{html}{\out{
}}\preformatted{ 1 2 3 4 +123456789012345678901234567890123456789012 +[ name 20 ][state 10][ ssn 12 ] +John Smith WA 418-Y11-4111 +Mary Hartford CA 319-Z19-4341 +Evan Nolan IL 219-532-c301 +}\if{html}{\out{
}} + +Here are some valid field specifications for the above (they aren't all +equivalent! but they are all valid): + +\if{html}{\out{
}}\preformatted{fwf_widths(c(20, 10, 12), c("name", "state", "ssn")) +fwf_positions(c(1, 30), c(20, 42), c("name", "ssn")) +fwf_cols(state = c(21, 30), last = c(6, 20), first = c(1, 4), ssn = c(31, 42)) +fwf_cols(name = c(1, 20), ssn = c(30, 42)) +fwf_cols(name = 20, state = 10, ssn = 12) +}\if{html}{\out{
}} } \section{Second edition changes}{ -Comments are no longer looked for anywhere in the file. -They are now only ignored at the start of a line. +Comments are now only ignored if they appear at the start of a line. +Comments elsewhere in a line are no longer treated specially. } \examples{ diff --git a/man/read_lines.Rd b/man/read_lines.Rd index 0107aa53..04e9e3f0 100644 --- a/man/read_lines.Rd +++ b/man/read_lines.Rd @@ -42,7 +42,7 @@ write_lines( Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/read_lines_chunked.Rd b/man/read_lines_chunked.Rd index 23825ef3..c7466046 100644 --- a/man/read_lines_chunked.Rd +++ b/man/read_lines_chunked.Rd @@ -30,7 +30,7 @@ read_lines_raw_chunked( Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/read_log.Rd b/man/read_log.Rd index f9a1b012..7db02c38 100644 --- a/man/read_log.Rd +++ b/man/read_log.Rd @@ -22,7 +22,7 @@ read_log( Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. 
Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/read_table.Rd b/man/read_table.Rd index 202f8b8b..4d7ddd4a 100644 --- a/man/read_table.Rd +++ b/man/read_table.Rd @@ -26,7 +26,7 @@ read_table( Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/spec_delim.Rd b/man/spec_delim.Rd index fe0aa816..72894167 100644 --- a/man/spec_delim.Rd +++ b/man/spec_delim.Rd @@ -125,7 +125,7 @@ spec_table( Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as diff --git a/man/tokenize.Rd b/man/tokenize.Rd index a1d814d4..932c9c47 100644 --- a/man/tokenize.Rd +++ b/man/tokenize.Rd @@ -13,7 +13,7 @@ tokenize(file, tokenizer = tokenizer_csv(), skip = 0, n_max = -1L) Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically -downloaded. Remote gz files can also be automatically downloaded and +downloaded. Remote \code{.gz} files can also be automatically downloaded and decompressed. 
Literal data is most useful for examples and tests. To be recognised as diff --git a/man/write_delim.Rd b/man/write_delim.Rd index 500ac5f7..1ed19639 100644 --- a/man/write_delim.Rd +++ b/man/write_delim.Rd @@ -133,10 +133,9 @@ vectors. If your data contains newlines within fields the parser will automatically be forced to use a single thread only.} \item{progress}{Display a progress bar? By default it will only display -in an interactive session and not while knitting a document. The display -is updated every 50,000 values and will only display if estimated reading -time is 5 seconds or more. The automatic progress bar can be disabled by -setting option \code{readr.show_progress} to \code{FALSE}.} +in an interactive session and not while executing in an RStudio notebook +chunk. The display of the progress bar can be disabled by setting the +environment variable \code{VROOM_SHOW_PROGRESS} to \code{"false"}.} } \value{ \verb{write_*()} returns the input \code{x} invisibly.