Overhaul archive fgets function

The old function was written in a time before we relied on it for nearly
every operation. Since then, we have switched to the archive backend and now
fast parsing is a big deal.

The former function made a per-character call to the libarchive
archive_read_data() function, which resulted in some 21 million calls in a
typical "load all sync dbs" operation. If we instead do some buffering of
our own and read the blocks directly, and then find our newlines from there,
we can cut out the multiple layers of overhead and go from archive to parsed
data much quicker.

Both users of the former function are switched over to the new signature,
made easier by the macros now in place in the sync backend parsing code.

Performance: for a `pacman -Su` (no upgrades available),
_alpm_archive_fgets() goes from being 29% of the total time to 12%. The time
spent on the libarchive function being called dropped from 24% to 6%.

This pushes _alpm_pkg_find back to the title of slowest low-level function.

Signed-off-by: Dan McGee <dan@archlinux.org>
This commit is contained in:
Dan McGee 2010-12-14 22:26:23 -06:00
parent 126f50ab0b
commit f2dff08600
4 changed files with 113 additions and 31 deletions

View file

@ -155,17 +155,22 @@ static struct pkg_operations *get_file_pkg_ops(void)
*/ */
static int parse_descfile(struct archive *a, pmpkg_t *newpkg) static int parse_descfile(struct archive *a, pmpkg_t *newpkg)
{ {
char line[PATH_MAX];
char *ptr = NULL; char *ptr = NULL;
char *key = NULL; char *key = NULL;
int linenum = 0; int linenum = 0;
struct archive_read_buffer buf;
ALPM_LOG_FUNC; ALPM_LOG_FUNC;
/* loop until we reach EOF (where archive_fgets will return NULL) */ memset(&buf, 0, sizeof(buf));
while(_alpm_archive_fgets(line, PATH_MAX, a) != NULL) { /* 512K for a line length seems reasonable */
buf.max_line_size = 512 * 1024;
/* loop until we reach EOF or other error */
while(_alpm_archive_fgets(a, &buf) == ARCHIVE_OK) {
char *line = _alpm_strtrim(buf.line);
linenum++; linenum++;
_alpm_strtrim(line);
if(strlen(line) == 0 || line[0] == '#') { if(strlen(line) == 0 || line[0] == '#') {
continue; continue;
} }

View file

@ -219,8 +219,8 @@ static int sync_db_populate(pmdb_t *db)
} }
/* READ_NEXT(s): in its new (right-hand) form, pull the next line through
 * _alpm_archive_fgets(archive, &buf) and jump to the enclosing function's
 * `error` label unless the call returns ARCHIVE_OK. On success `s` is set to
 * the result of _alpm_strtrim(buf.line). The expansion relies on `archive`,
 * `buf` and an `error` label being in scope at the point of use. `s` is
 * derived from buf.line, which _alpm_archive_fgets() rewrites (and may free
 * or reallocate) on its next call, so copy anything that must outlive the
 * next read. */
#define READ_NEXT(s) do { \ #define READ_NEXT(s) do { \
if(_alpm_archive_fgets(s, sizeof(s), archive) == NULL) goto error; \ if(_alpm_archive_fgets(archive, &buf) != ARCHIVE_OK) goto error; \
_alpm_strtrim(s); \ s = _alpm_strtrim(buf.line); \
} while(0) } while(0)
#define READ_AND_STORE(f) do { \ #define READ_AND_STORE(f) do { \
@ -238,10 +238,10 @@ static int sync_db_populate(pmdb_t *db)
static int sync_db_read(pmdb_t *db, struct archive *archive, struct archive_entry *entry) static int sync_db_read(pmdb_t *db, struct archive *archive, struct archive_entry *entry)
{ {
char line[1024];
const char *entryname = NULL; const char *entryname = NULL;
char *filename, *pkgname, *p, *q; char *filename, *pkgname, *p, *q;
pmpkg_t *pkg; pmpkg_t *pkg;
struct archive_read_buffer buf;
ALPM_LOG_FUNC; ALPM_LOG_FUNC;
@ -260,6 +260,10 @@ static int sync_db_read(pmdb_t *db, struct archive *archive, struct archive_entr
_alpm_log(PM_LOG_FUNCTION, "loading package data from archive entry %s\n", _alpm_log(PM_LOG_FUNCTION, "loading package data from archive entry %s\n",
entryname); entryname);
memset(&buf, 0, sizeof(buf));
/* 512K for a line length seems reasonable */
buf.max_line_size = 512 * 1024;
/* get package and db file names */ /* get package and db file names */
STRDUP(pkgname, entryname, RET_ERR(PM_ERR_MEMORY, -1)); STRDUP(pkgname, entryname, RET_ERR(PM_ERR_MEMORY, -1));
p = pkgname + strlen(pkgname); p = pkgname + strlen(pkgname);
@ -279,8 +283,9 @@ static int sync_db_read(pmdb_t *db, struct archive *archive, struct archive_entr
if(strcmp(filename, "desc") == 0 || strcmp(filename, "depends") == 0 if(strcmp(filename, "desc") == 0 || strcmp(filename, "depends") == 0
|| strcmp(filename, "deltas") == 0) { || strcmp(filename, "deltas") == 0) {
while(_alpm_archive_fgets(line, sizeof(line), archive) != NULL) { while(_alpm_archive_fgets(archive, &buf) == ARCHIVE_OK) {
_alpm_strtrim(line); char *line = _alpm_strtrim(buf.line);
if(strcmp(line, "%NAME%") == 0) { if(strcmp(line, "%NAME%") == 0) {
READ_NEXT(line); READ_NEXT(line);
if(strcmp(line, pkg->name) != 0) { if(strcmp(line, pkg->name) != 0) {

View file

@ -771,33 +771,89 @@ int _alpm_test_md5sum(const char *filepath, const char *md5sum)
return(ret); return(ret);
} }
char *_alpm_archive_fgets(char *line, size_t size, struct archive *a) /* Note: does NOT handle sparse files on purpose for speed. */
int _alpm_archive_fgets(struct archive *a, struct archive_read_buffer *b)
{ {
/* for now, just read one char at a time until we get to a char *i = NULL;
* '\n' char. we can optimize this later with an internal int64_t offset;
* buffer. */ int done = 0;
/* leave room for zero terminator */
char *last = line + size - 1;
char *i;
for(i = line; i < last; i++) { while(1) {
int ret = archive_read_data(a, i, 1); /* have we processed this entire block? */
/* special check for first read- if null, return null, if(b->block + b->block_size == b->block_offset) {
* this indicates EOF */ if(b->ret == ARCHIVE_EOF) {
if(i == line && (ret <= 0 || *i == '\0')) { /* reached end of archive on the last read, now we are out of data */
return(NULL); goto cleanup;
}
/* zero-copy - this is the entire next block of data. */
b->ret = archive_read_data_block(a, (void*)&b->block,
&b->block_size, &offset);
b->block_offset = b->block;
/* error or end of archive with no data read, cleanup */
if(b->ret < ARCHIVE_OK ||
(b->block_size == 0 && b->ret == ARCHIVE_EOF)) {
goto cleanup;
}
} }
/* check if read value was null or newline */
if(ret <= 0 || *i == '\0' || *i == '\n') { /* loop through the block looking for EOL characters */
last = i + 1; for(i = b->block_offset; i < (b->block + b->block_size); i++) {
break; /* check if read value was null or newline */
if(*i == '\0' || *i == '\n') {
done = 1;
break;
}
}
/* allocate our buffer, or ensure our existing one is big enough */
if(!b->line) {
/* set the initial buffer to the read block_size */
CALLOC(b->line, b->block_size + 1, sizeof(char),
RET_ERR(PM_ERR_MEMORY, -1));
b->line_size = b->block_size + 1;
b->line_offset = b->line;
} else {
size_t needed = (b->line_offset - b->line) + (i - b->block_offset) + 1;
if(needed > b->max_line_size) {
RET_ERR(PM_ERR_MEMORY, -1);
}
if(needed > b->line_size) {
/* need to realloc + copy data to fit total length */
char *new;
CALLOC(new, needed, sizeof(char), RET_ERR(PM_ERR_MEMORY, -1));
memcpy(new, b->line, b->line_size);
b->line_size = needed;
b->line_offset = new + (b->line_offset - b->line);
free(b->line);
b->line = new;
}
}
if(done) {
size_t len = i - b->block_offset;
memcpy(b->line_offset, b->block_offset, len);
b->line_offset[len] = '\0';
b->block_offset = ++i;
/* this is the main return point; from here you can read b->line */
return(ARCHIVE_OK);
} else {
/* we've looked through the whole block but no newline, copy it */
size_t len = b->block + b->block_size - b->block_offset;
memcpy(b->line_offset, b->block_offset, len);
b->line_offset += len;
b->block_offset = i;
} }
} }
/* always null terminate the buffer */ cleanup:
*last = '\0'; {
int ret = b->ret;
return(line); FREE(b->line);
memset(b, 0, sizeof(b));
return(ret);
}
} }
int _alpm_splitname(const char *target, pmpkg_t *pkg) int _alpm_splitname(const char *target, pmpkg_t *pkg)

View file

@ -59,6 +59,22 @@
_alpm_log(PM_LOG_DEBUG, "returning error %d from %s : %s\n", err, __func__, alpm_strerrorlast()); \ _alpm_log(PM_LOG_DEBUG, "returning error %d from %s : %s\n", err, __func__, alpm_strerrorlast()); \
return(ret); } while(0) return(ret); } while(0)
/**
* Used as a buffer/state holder for _alpm_archive_fgets().
*
* Callers zero the whole struct and set max_line_size before the first call,
* then pass the same instance to every call; _alpm_archive_fgets() maintains
* all other fields.
*/
struct archive_read_buffer {
/* line buffer allocated by _alpm_archive_fgets(); holds the NUL-terminated
* line after a call returns ARCHIVE_OK, and is freed on its cleanup path */
char *line;
/* position inside `line` where the next chunk of block data is copied */
char *line_offset;
/* number of bytes currently allocated for `line` */
size_t line_size;
/* caller-set limit: a line needing more bytes than this (terminator
* included) makes _alpm_archive_fgets() fail with PM_ERR_MEMORY */
size_t max_line_size;
/* start of the data block most recently returned by
* archive_read_data_block(); not copied ("zero-copy") and never freed by
* _alpm_archive_fgets() -- presumably owned by libarchive, TODO confirm */
char *block;
/* first byte of `block` that has not been consumed yet */
char *block_offset;
/* length in bytes of `block` as reported by archive_read_data_block() */
size_t block_size;
/* last return code of archive_read_data_block(); returned to the caller
* when _alpm_archive_fgets() takes its cleanup path */
int ret;
};
int _alpm_makepath(const char *path); int _alpm_makepath(const char *path);
int _alpm_makepath_mode(const char *path, mode_t mode); int _alpm_makepath_mode(const char *path, mode_t mode);
int _alpm_copyfile(const char *src, const char *dest); int _alpm_copyfile(const char *src, const char *dest);
@ -76,7 +92,7 @@ char *_alpm_filecache_find(const char *filename);
const char *_alpm_filecache_setup(void); const char *_alpm_filecache_setup(void);
int _alpm_lstat(const char *path, struct stat *buf); int _alpm_lstat(const char *path, struct stat *buf);
int _alpm_test_md5sum(const char *filepath, const char *md5sum); int _alpm_test_md5sum(const char *filepath, const char *md5sum);
char *_alpm_archive_fgets(char *line, size_t size, struct archive *a); int _alpm_archive_fgets(struct archive *a, struct archive_read_buffer *b);
int _alpm_splitname(const char *target, pmpkg_t *pkg); int _alpm_splitname(const char *target, pmpkg_t *pkg);
unsigned long _alpm_hash_sdbm(const char *str); unsigned long _alpm_hash_sdbm(const char *str);