/* office-gobmx/solenv/bin/concat-deps.c */


/*
* Copyright (C) 2011 Norbert Thiebaud
* License: GPLv3
*/
/* define to activate stats reporting on hash usage */
/* #define HASH_STAT */
/* ===============================================
* Set-up: defines to identify the system and system related properties
* ===============================================
*/
#ifdef __APPLE__
#ifdef __x86_64__
#define CORE_BIG_ENDIAN 0
#define CORE_LITTLE_ENDIAN 1
#define USE_MEMORY_ALIGNMENT 64 /* big value -> no alignment */
#else
#define CORE_BIG_ENDIAN 1
#define CORE_LITTLE_ENDIAN 0
#define USE_MEMORY_ALIGNMENT 4
#endif
#endif
#ifdef _AIX
#define CORE_BIG_ENDIAN 1
#define CORE_LITTLE_ENDIAN 0
#define USE_MEMORY_ALIGNMENT 4
#endif /* Def _AIX */
#ifdef __CYGWIN__
#define __windows
#define CORE_BIG_ENDIAN 0
#define CORE_LITTLE_ENDIAN 1
#define USE_MEMORY_ALIGNMENT 64 /* big value -> no alignment */
#endif /* Def __CYGWIN__ */
#if defined(__linux) || defined(__OpenBSD__) || \
defined(__FreeBSD__) || defined(__NetBSD__) || \
defined(__DragonFly__)
#include <sys/param.h> /* pulls in the __BYTE_ORDER macros (via endian.h where available) */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define CORE_BIG_ENDIAN 0
#define CORE_LITTLE_ENDIAN 1
#define USE_MEMORY_ALIGNMENT 64
#else /* !(__BYTE_ORDER == __LITTLE_ENDIAN) */
#if __BYTE_ORDER == __BIG_ENDIAN
#define CORE_BIG_ENDIAN 1
#define CORE_LITTLE_ENDIAN 0
#define USE_MEMORY_ALIGNMENT 4
#endif /* __BYTE_ORDER == __BIG_ENDIAN */
#endif /* !(__BYTE_ORDER == __LITTLE_ENDIAN) */
#endif /* Def __linux || Def *BSD */
#ifdef __sun
#ifdef __sparc
#define CORE_BIG_ENDIAN 1
#define CORE_LITTLE_ENDIAN 0
#define USE_MEMORY_ALIGNMENT 4
#else /* Ndef __sparc */
#define CORE_BIG_ENDIAN 0
#define CORE_LITTLE_ENDIAN 1
#define USE_MEMORY_ALIGNMENT 4
#endif /* Ndef __sparc */
#endif /* Def __sun */
/* Note: USE_MEMORY_ALIGNMENT is 4 for platforms that allow non-aligned short access
 * but require int access to be aligned (e.g. sparc, ppc, zos...)
 * USE_MEMORY_ALIGNMENT is 2 for platforms that require both short and int access to be aligned (e.g. hppa)
 * if the platform has no alignment requirement (x86/amd64) use a big value (i.e. > 16)
 */
#ifndef USE_MEMORY_ALIGNMENT
#error "USE_MEMORY_ALIGNMENT must be defined to the proper alignment value for the platform"
#endif
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <ctype.h>
#ifdef __windows
#include <io.h>
#else
#include <unistd.h>
#endif
/* modes */
#ifdef __windows
#define FILE_O_RDONLY _O_RDONLY
#define FILE_O_BINARY _O_BINARY
#else /* not windaube */
#define FILE_O_RDONLY O_RDONLY
#define FILE_O_BINARY 0
#endif /* not windaube */
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifdef __GNUC__
#define clz __builtin_clz
#else
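/* Portable fallback: counts the leading zeros of a 32-bit value by measuring
* its bit length, e.g. clz(5121) == 19 since 5121 needs 13 bits. Unlike
* __builtin_clz(), it returns 32 for value == 0 instead of being undefined.
*/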
static inline int clz(unsigned int value)
{
int result = 32;
while(value)
{
value >>= 1;
result -= 1;
}
return result;
}
#endif
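/* get_unaligned_uint() reads a 32-bit word that may not be naturally aligned:
* on lax platforms (USE_MEMORY_ALIGNMENT > 4) a plain dereference is safe,
* while strict platforms go through memcpy(), which compilers typically lower
* to the cheapest safe load sequence for the target.
*/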
#if (USE_MEMORY_ALIGNMENT > 4)
#define get_unaligned_uint(str) (*(unsigned int*)(str))
#else
static inline unsigned int get_unaligned_uint(const unsigned char* cursor)
{
unsigned int result;
memcpy(&result, cursor, sizeof(unsigned int));
return result;
}
#endif
/* ===============================================
* memory pool for fast fixed-size allocation (non-thread-safe)
* ===============================================
*/
struct pool
{
void* head_free; /**< head of a linked list of freed element */
char* fresh; /**< top of a memory block to dig new element */
char* tail; /**< to detect end of extent... when fresh pass tail */
void* extent; /**< pointer to the primary extent block */
int size_elem; /**< size of an element. */
int primary; /**< primary allocation in bytes */
int secondary; /**< secondary allocation in bytes */
};
#define POOL_ALIGN_INCREMENT 8 /**< Alignment; must be a power of 2 and >= sizeof(void*) */
static void* pool_take_extent(struct pool* pool, int allocate)
{
unsigned int size = 0;
void* extent;
void* data = NULL;
if(pool->extent)
{
fputs("taking a pool extent\n", stderr);
/* we already have an extent, so this is a secondary */
if(pool->secondary)
{
size = pool->secondary;
}
}
else
{
assert(pool->primary);
size = pool->primary;
}
if(size)
{
extent = malloc(size);
if(extent)
{
*(void**)extent = pool->extent;
pool->extent = extent;
if(allocate)
{
data = ((char*)extent) + POOL_ALIGN_INCREMENT;
pool->fresh = ((char*)data) + pool->size_elem;
pool->tail = pool->fresh + (size - pool->size_elem);
}
else
{
pool->fresh = ((char*)extent) + POOL_ALIGN_INCREMENT;
pool->tail = pool->fresh + (size - pool->size_elem);
}
}
}
return data;
}
/* Create a memory pool for fixed-size objects
* this is a simplified implementation that
* is _not_ thread safe.
*/
struct pool* pool_create(int size_elem, int flags, int primary, int secondary)
{
struct pool* pool;
assert(primary > 0);
assert(secondary >= 0);
assert(size_elem > 0);
pool = (struct pool*)calloc(1, sizeof(struct pool));
if(!pool) return NULL;
/* Adjust the element size so that it is aligned and so that an element can
* at least contain a void*
*/
pool->size_elem = size_elem = (size_elem + POOL_ALIGN_INCREMENT - 1) & ~(POOL_ALIGN_INCREMENT - 1);
pool->primary = (size_elem * primary) + POOL_ALIGN_INCREMENT;
pool->secondary = secondary > 0 ? (size_elem * secondary) + POOL_ALIGN_INCREMENT : 0;
pool_take_extent(pool, FALSE);
return pool;
}
void pool_destroy(struct pool* pool)
{
void* extent;
void* next;
if(pool != NULL)
{
extent = pool->extent;
while(extent)
{
next = *(void**)extent;
free(extent);
extent = next;
}
free(pool);
}
}
static inline void* pool_alloc(struct pool* pool)
{
void* data;
data = pool->head_free;
if(data == NULL)
{
/* we have no old-freed elem */
if(pool->fresh <= pool->tail)
{
/* pick a slice of the current extent */
data = (void*)pool->fresh;
pool->fresh += pool->size_elem;
}
else
{
/* allocate a new extent */
data = pool_take_extent(pool, TRUE);
}
}
else
{
/* re-use an old freed element by chopping the head of the free list */
pool->head_free = *(void**)data;
}
return data;
}
static inline void pool_free(struct pool* pool, void* data)
{
assert(pool && data);
/* stack on top of the free list */
*(void**)data = pool->head_free;
pool->head_free = data;
}
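/* Illustrative usage sketch of the pool API (the sizes mirror hash_create()'s
* own use of the pool; this snippet is not part of the tool itself):
*
*   struct pool* p = pool_create(sizeof(struct hash_elem), 0, 4096, 8192);
*   struct hash_elem* e = pool_alloc(p);  // carve a fresh slot or reuse a freed one
*   pool_free(p, e);                      // push the slot back onto the free list
*   pool_destroy(p);                      // release every extent in one pass
*/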
/* ===============================================
* Hash implementation customized to just track
* a unique list of strings (i.e. no data associated
* with the key, no need for retrieval, etc...)
*
* This is tuned for the particular use-case we have here.
* Measurements on tail_build showed that
* we can get north of 4000 distinct values stored in a hash;
* the collision rate is at worst around 2%,
* and collisions that need an expensive memcmp to resolve
* occur typically at a rate of 1 per 1000.
* For tail_build we register 37229 unique keys
* with a total of 377 extra memcmp needed,
* which is completely negligible compared to the
* number of memcmp required to eliminate duplicate
* entries (north of 2.5 million for tail_build)
* ===============================================
*/
struct hash_elem
{
struct hash_elem* next;
const char* key;
int key_len;
};
struct hash
{
struct hash_elem** array;
struct pool* elems_pool;
int flags;
unsigned int used;
unsigned int size;
unsigned int load_limit;
#ifdef HASH_STAT
int stored;
int collisions;
int cost;
int memcmp;
#endif
};
#define HASH_F_NO_RESIZE (1<<0)
/* The following hash_compute function was adapted from :
* lookup3.c, by Bob Jenkins, May 2006, Public Domain.
*
* The changes from the original are mostly cosmetic
*/
#define hashsize(n) (1<<(n))
#define hashmask(n) (hashsize(n)-1)
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
#if CORE_BIG_ENDIAN
#define MASK_C1 0xFFFFFF00
#define MASK_C2 0xFFFF0000
#define MASK_C3 0xFF000000
#else
#if CORE_LITTLE_ENDIAN
#define MASK_C1 0xFFFFFF
#define MASK_C2 0xFFFF
#define MASK_C3 0xFF
#else
#error "Missing Endianness definition"
#endif
#endif
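/* The MASK_C* values keep only the bytes that belong to the key when the
* last 4-byte read overshoots its end: on little-endian machines the first
* key bytes land in the low-order bits (MASK_C1 keeps 3 bytes, MASK_C2 keeps
* 2, MASK_C3 keeps 1), on big-endian machines in the high-order bits.
*/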
#define mix(a,b,c) \
{ \
a -= c; a ^= rot(c, 4); c += b; \
b -= a; b ^= rot(a, 6); a += c; \
c -= b; c ^= rot(b, 8); b += a; \
a -= c; a ^= rot(c,16); c += b; \
b -= a; b ^= rot(a,19); a += c; \
c -= b; c ^= rot(b, 4); b += a; \
}
#define final(a,b,c) \
{ \
c ^= b; c -= rot(b,14); \
a ^= c; a -= rot(c,11); \
b ^= a; b -= rot(a,25); \
c ^= b; c -= rot(b,16); \
a ^= c; a -= rot(c,4); \
b ^= a; b -= rot(a,14); \
c ^= b; c -= rot(b,24); \
}
static unsigned int hash_compute( struct hash* hash, const char* key, int length)
{
unsigned int a;
unsigned int b;
unsigned int c; /* internal state */
const unsigned char* uk = (const unsigned char*)key;
/* Set up the internal state */
a = b = c = 0xdeadbeef + (length << 2);
/* we use this to 'hash' full paths that mostly share a common root
* let's not waste too many cycles hashing that mostly-constant prefix
*/
if(length > 36)
{
uk += length - 36;
length = 36;
}
/*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
while (length > 12)
{
a += get_unaligned_uint(uk);
b += get_unaligned_uint(uk+4);
c += get_unaligned_uint(uk+8);
mix(a,b,c);
length -= 12;
uk += 12;
}
/*----------------------------- handle the last (probably partial) block */
/* Note: we possibly over-read, which would trigger a complaint from VALGRIND,
* but we mask the undefined bytes if any, so we are still good: thanks
* to the alignment of memory allocations and the tail-memory management overhead
* we can always read 3 bytes past the official end without triggering
* a segfault -- if you find a platform/compiler combination for which that postulate
* is false, then you just need to over-allocate by 2 more bytes in file_load();
* file_load() already over-allocates by 1 to stick a \0 at the end of the buffer.
*/
switch(length)
{
case 12: c+=get_unaligned_uint(uk+8); b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break;
case 11: c+=get_unaligned_uint(uk+8) & MASK_C1; b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break;
case 10: c+=get_unaligned_uint(uk+8) & MASK_C2; b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break;
case 9 : c+=get_unaligned_uint(uk+8) & MASK_C3; b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break;
case 8 : b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break;
case 7 : b+=get_unaligned_uint(uk+4) & MASK_C1; a+=get_unaligned_uint(uk); break;
case 6 : b+=get_unaligned_uint(uk+4) & MASK_C2; a+=get_unaligned_uint(uk); break;
case 5 : b+=get_unaligned_uint(uk+4) & MASK_C3; a+=get_unaligned_uint(uk); break;
case 4 : a+=get_unaligned_uint(uk); break;
case 3 : a+=get_unaligned_uint(uk) & MASK_C1; break;
case 2 : a+=get_unaligned_uint(uk) & MASK_C2; break;
case 1 : a+=get_unaligned_uint(uk) & MASK_C3; break;
case 0 : return c & hash->size; /* zero length strings require no mixing */
}
final(a,b,c);
return c & hash->size;
}
static void hash_destroy(struct hash* hash)
{
if(hash)
{
if(hash->array)
{
free(hash->array);
}
if(hash->elems_pool)
{
pool_destroy(hash->elems_pool);
}
free(hash);
}
}
static struct hash* hash_create(unsigned int size)
{
struct hash* hash;
assert(size > 0);
hash = calloc(1, sizeof(struct hash));
if(hash)
{
size += (size >> 2) + 1; /* ~ 75% load factor */
if(size >= 15)
{
hash->size = (((unsigned int)0xFFFFFFFF) >> clz((unsigned int)size));
}
else
{
hash->size = size = 15;
}
hash->load_limit = hash->size - (hash->size >> 2);
hash->used = 0;
hash->array = (struct hash_elem**)calloc(hash->size + 1, sizeof(struct hash_elem*));
if(hash->array == NULL)
{
hash_destroy(hash);
hash = NULL;
}
}
if(hash)
{
hash->elems_pool = pool_create(sizeof(struct hash_elem),
0, size, size << 1);
if(!hash->elems_pool)
{
hash_destroy(hash);
hash = NULL;
}
}
return hash;
}
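/* Worked sizing example: with kDEFAULT_HASH_SIZE (4096) the requested size
* becomes 4096 + 1025 = 5121, which the clz() trick rounds to the mask
* 0x1FFF = 8191, i.e. 8192 buckets, with a load_limit of 8191 - 2047 = 6144
* entries before hash_resize() kicks in.
*/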
static void hash_resize(struct hash* hash)
{
unsigned int old_size = hash->size;
unsigned int hashed;
struct hash_elem* hash_elem;
struct hash_elem* next;
struct hash_elem** array;
unsigned int i;
hash->size = (old_size << 1) + 1;
/* we really should avoid getting here... so print a message to alert of the condition */
fprintf(stderr, "resize hash %u -> %u\n", old_size, hash->size);
if(hash->size == old_size)
{
hash->flags |= HASH_F_NO_RESIZE;
return;
}
array = calloc(hash->size + 1, sizeof(struct hash_elem*));
if(array)
{
hash->load_limit = hash->size - (hash->size >> 2);
for(i=0; i <= old_size; i++)
{
hash_elem = (struct hash_elem*)hash->array[i];
while(hash_elem)
{
next = hash_elem->next;
hashed = hash_compute(hash, hash_elem->key, hash_elem->key_len);
hash_elem->next = array[hashed];
array[hashed] = hash_elem;
hash_elem = next;
}
}
free(hash->array);
hash->array = (struct hash_elem**)array;
}
else
{
hash->size = old_size;
hash->flags |= HASH_F_NO_RESIZE;
}
}
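/* Worked example: doubling keeps the 2^n - 1 mask shape, e.g. 8191 -> 16383.
* If the calloc() fails, HASH_F_NO_RESIZE records the failure and the table
* keeps working with the old array, just with longer collision chains.
*/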
#ifdef HASH_STAT
static inline int compare_key(struct hash* hash, const char* a, const char* b, int len, int* cost)
{
*cost += 1;
hash->memcmp += 1;
return memcmp(a,b, len);
}
#else
#define compare_key(h,a,b,l,c) memcmp(a,b,l)
#endif
/* a customized hash_store function that just stores the key and returns
* TRUE if the key was effectively stored, or FALSE if the key was already there
*/
static int hash_store(struct hash* hash, const char* key, int key_len)
{
unsigned int hashed;
struct hash_elem* hash_elem;
int cost = 0;
hashed = hash_compute(hash, key, key_len);
#ifdef HASH_STAT
hash->stored += 1;
#endif
hash_elem = (struct hash_elem*)hash->array[hashed];
while(hash_elem && (hash_elem->key_len != key_len || compare_key(hash, hash_elem->key, key, key_len, &cost)))
{
hash_elem = hash_elem->next;
}
if(!hash_elem)
{
hash_elem = pool_alloc(hash->elems_pool);
if(hash_elem)
{
hash_elem->key = key;
hash_elem->key_len = key_len;
hash_elem->next = hash->array[hashed];
#ifdef HASH_STAT
if(hash_elem->next)
{
hash->collisions += 1;
hash->cost += cost;
}
#endif
hash->array[hashed] = hash_elem;
hash->used += 1;
if(hash->used > hash->load_limit)
{
hash_resize(hash);
}
}
return TRUE;
}
return FALSE;
}
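/* For example, hash_store(h, "foo.o:", 6) returns TRUE the first time that
* key is seen and FALSE for any later duplicate, which is exactly the test
* _process() uses to decide whether a no-dep rule should be emitted.
*/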
static int file_stat(const char* name, struct stat* buffer_stat, int* rc)
{
int rc_local = 0;
rc_local = stat(name, buffer_stat);
if (rc_local < 0)
{
*rc = errno;
}
return rc_local;
}
static off_t file_get_size(const char* name, int* rc)
{
struct stat buffer_stat;
off_t size = -1;
if (!file_stat(name, &buffer_stat, rc))
{
if(S_ISREG(buffer_stat.st_mode))
{
size = buffer_stat.st_size;
}
else
{
*rc = EINVAL;
}
}
return size;
}
static char* file_load(const char* name, off_t* size, int* return_rc)
{
off_t local_size = 0;
int rc = 0;
char* buffer = NULL;
int fd;
assert(name != NULL);
if(!size)
{
size = &local_size;
}
*size = file_get_size(name, &rc);
if (!rc)
{
fd = open(name, FILE_O_RDONLY | FILE_O_BINARY);
if (fd != -1)
{
buffer = malloc((size_t)(*size + 1));
if (buffer == NULL)
{
rc = ENOMEM;
}
else
{
ssize_t i;
REDO:
i = read(fd, buffer, (size_t)(*size));
if(i == -1)
{
if(errno == EINTR)
{
goto REDO;
}
else
{
rc = errno;
}
}
else
{
if (i != *size)
{
rc = EIO;
}
}
close(fd);
buffer[*size] = 0;
}
}
}
if(rc && buffer)
{
free(buffer);
buffer = NULL;
}
if(return_rc)
{
*return_rc = rc;
}
return buffer;
}
static void _cancel_relative(char* base, char** ref_cursor, char** ref_cursor_out, char* end)
{
char* cursor = *ref_cursor;
char* cursor_out = *ref_cursor_out;
do
{
cursor += 3;
while(cursor_out > base && *--cursor_out != '/');
}
while(cursor + 3 < end && !memcmp(cursor, "/../", 4));
*ref_cursor = cursor;
*ref_cursor_out = cursor_out;
}
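/* Worked example: while copying "include/foo/../bar.h", the output holds
* "include/foo" when the input cursor reaches "/../"; _cancel_relative()
* skips the "/.." in the input and backs the output up to the '/' before
* "foo", so the copy resumes with "/bar.h" and yields "include/bar.h".
*/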
static int _process(struct hash* dep_hash, char* fn)
{
int rc;
char* buffer;
char* end;
char* cursor;
char* cursor_out;
char* base;
int continuation = 0;
char last_ns = 0;
off_t size;
buffer = file_load(fn, &size, &rc);
/* Note: yes, we are going to leak 'buffer'.
* This is on purpose: it avoids cloning the 'key' out of it,
* since our special 'hash' just stores a pointer to the key
* inside of buffer, which therefore needs to remain allocated
*/
if(!rc)
{
base = cursor_out = cursor = end = buffer;
end += size;
while(cursor < end)
{
if(*cursor == '\\')
{
continuation = 1;
*cursor_out++ = *cursor++;
}
else if(*cursor == '/')
{
if(cursor + 3 < end)
{
if(!memcmp(cursor, "/../", 4))
{
_cancel_relative(base, &cursor, &cursor_out, end);
}
}
*cursor_out++ = *cursor++;
}
else if(*cursor == '\n')
{
if(!continuation)
{
*cursor_out = 0;
if(base < cursor)
{
/* here we have a complete rule */
if(last_ns == ':')
{
/* if the rule ended in ':' that is a no-dep rule
* these are the one for which we want to filter
* duplicate out
*/
if(hash_store(dep_hash, base, (int)(cursor_out - base)))
{
puts(base);
putc('\n', stdout);
}
}
else
{
/* rule with dep, just write it */
puts(base);
putc('\n', stdout);
}
}
cursor += 1;
base = cursor_out = cursor;
}
else
{
/* here we have a '\' followed by a \n: this is a continuation,
* i.e. not a complete rule yet
*/
*cursor_out++ = *cursor++;
}
}
else
{
continuation = 0;
/* not using isspace() here saves 25% of I refs and 75% of D refs based on cachegrind */
if(*cursor != ' ' && *cursor != '\n' && *cursor != '\t' )
{
last_ns = *cursor;
}
*cursor_out++ = *cursor++;
}
}
/* just in case the file did not end with a \n, there may be a pending rule */
if(base < cursor_out)
{
if(last_ns == ':')
{
if(hash_store(dep_hash, base, (int)(cursor_out - base)))
{
puts(base);
putc('\n', stdout);
}
}
else
{
puts(base);
putc('\n', stdout);
}
}
}
return rc;
}
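/* Illustrative effect on a hypothetical dep fragment:
*
*   foo.o : foo.c \
*     bar.h
*   gen.h :
*   gen.h :
*
* the backslash continuation keeps "foo.o : foo.c bar.h" as one rule, which
* always gets printed, while the duplicated no-dep rule "gen.h :" is emitted
* only once thanks to hash_store().
*/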
static void _usage(void)
{
fputs("Usage: concat-deps <file that contains dep_files>\n", stderr);
}
#define kDEFAULT_HASH_SIZE 4096
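/* Invocation sketch (file names are illustrative): argv[1] names a file that
* lists the dep files, separated by spaces and/or newlines, and the merged
* result goes to stdout, e.g.
*
*   printf 'a.d b.d\n' > dep_list
*   SRCDIR=/path/to/src concat-deps dep_list > combined.d
*/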
int main(int argc, char** argv)
{
int rc = 0;
off_t in_list_size = 0;
char* in_list;
char* in_list_cursor;
char* in_list_end;
char* in_list_base;
struct hash* dep_hash;
char* base_dir;
if(argc < 2)
{
_usage();
return 1;
}
base_dir = getenv("SRCDIR");
if(!base_dir)
{
fputs("Error: SRCDIR si missing in the environement\n", stderr);
return 1;
}
in_list = file_load(argv[1], &in_list_size, &rc);
if(!rc)
{
dep_hash = hash_create(kDEFAULT_HASH_SIZE);
in_list_end = in_list + in_list_size;
in_list_base = in_list_cursor = in_list;
/* extract the filenames of the dep files from a 'space'-separated list */
while(*in_list_cursor)
{
if(*in_list_cursor == ' ' || *in_list_cursor == '\n')
{
*in_list_cursor = 0;
if(in_list_base < in_list_cursor)
{
rc = _process(dep_hash, in_list_base);
if(rc)
{
break;
}
}
in_list_cursor += 1;
in_list_base = in_list_cursor;
}
else
{
in_list_cursor += 1;
}
}
if(!rc)
{
/* catch the last entry in case the input did not terminate with a 'space' */
if(in_list_base < in_list_cursor)
{
rc = _process(dep_hash, in_list_base);
}
}
#ifdef HASH_STAT
fprintf(stderr, "stats: u:%d s:%d l:%d t:%d c:%d m:%d $:%d\n",
dep_hash->used, dep_hash->size, dep_hash->load_limit, dep_hash->stored,
dep_hash->collisions, dep_hash->memcmp, dep_hash->cost);
#endif
}
return rc;
}