/* Copyright (c) 2020 AlaskanEmily
 *
 * This software is provided 'as-is', without any express or implied warranty.
 * In no event will the authors be held liable for any damages arising from
 * the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *   claim that you wrote the original software. If you use this software in a
 *   product, an acknowledgment in the product documentation would be
 *   appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *   misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#include "sl_s_parse.h"

#include <string.h>

/* Compile-time selection helpers keyed on SL_S_UNICODE:
 *  - SL_S_UNICODE_CHOOSE(YES, NO) expands to YES when SL_S_UNICODE is
 *    defined and to NO otherwise.
 *  - SL_S_UNICODE_ONLY(X) expands to X only when SL_S_UNICODE is defined.
 *  - SL_S_NOT_UNICODE_ONLY(X) expands to X only when it is not defined.
 */
#ifdef SL_S_UNICODE
#define SL_S_UNICODE_CHOOSE(YES, NO) YES
#define SL_S_UNICODE_ONLY(X) X
#define SL_S_NOT_UNICODE_ONLY(X)
#else
#define SL_S_UNICODE_CHOOSE(YES, NO) NO
#define SL_S_UNICODE_ONLY(X)
#define SL_S_NOT_UNICODE_ONLY(X) X
#endif

/*****************************************************************************/

/* 1 when this file was compiled with SL_S_UNICODE defined, 0 otherwise. */
const int sl_s_unicode_enabled = SL_S_UNICODE_CHOOSE(1, 0);

/*****************************************************************************/

/* 0 when this file was compiled with SL_S_NO_PARSE_INFO defined (no line
 * tracking in the parser below), 1 otherwise. */
const int sl_s_parse_info_enabled =
#ifdef SL_S_NO_PARSE_INFO
    0;
#else
    1;
#endif

/*****************************************************************************/

/* Forward declaration: sl_s_parse_list and sl_s_parse_value call each other.
 * The in_out_line parameter only exists when parse info is compiled in. */
static struct SL_S_List *sl_s_parse_list(const char *src,
#ifndef SL_S_NO_PARSE_INFO
    unsigned *in_out_line,
#endif
    unsigned *in_out_index,
    unsigned len);

/*****************************************************************************/

/* Forward declaration: needed because sl_s_parse_list (defined earlier in
 * the file) calls sl_s_parse_value. The in_out_line parameter only exists
 * when parse info is compiled in. */
static void sl_s_parse_value(const char *src,
#ifndef SL_S_NO_PARSE_INFO
    unsigned *in_out_line,
#endif
    unsigned *in_out_index,
    unsigned len,
    void **to);

/*****************************************************************************/

#ifndef SL_S_NO_PARSE_INFO
static unsigned sl_s_count_newlines(const char *src, unsigned end){
    register sl_s_len_t n;
    register unsigned i;
    for(i = n = 0; i < end && n < ~(sl_s_len_t)0; i++){
        if(src[i] == '\n')
            n++;
    }
    return n;
}
#endif

/*****************************************************************************/

/**
 * Returns the encoded length, in bytes, of a comment-starting codepoint, or
 * 0 if the codepoint does not start a comment.
 *
 * ';' always starts a comment. When built with SL_S_UNICODE, several
 * semicolon look-alikes are accepted as well.
 */
static unsigned sl_s_begin_comment_length(unsigned codepoint){
    if(codepoint == ';')
        return 1;
#ifdef SL_S_UNICODE
    /* Alternative comment characters */
    switch(codepoint){
        case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
        case 0xFE54: /* SMALL SEMICOLON */
        case 0xFF1B: /* FULLWIDTH SEMICOLON */
        /* U+FE1B is not an assigned character and looks like a typo for
         * U+FF1B above; it is still accepted so that existing behaviour is
         * preserved. */
        case 0xFE1B:
            return SL_S_UTF8Length(codepoint);
        default:
            break;
    }
#endif
    return 0;
}

/*****************************************************************************/
/**
 * Tests whether a comment-starting character sits at src[index].
 *
 * Returns 0 when index is at or past len, when nothing could be decoded, or
 * when the character there does not start a comment. Otherwise returns the
 * value reported by sl_s_begin_comment_length for that character.
 */
static int sl_s_begin_comment(const char *src,
    unsigned index,
    unsigned len){
    
    unsigned cp;
    
    if(index < len){
#ifdef SL_S_UNICODE
        if(SL_S_UTF8Decode(src, index, len, &cp) != 0)
            return sl_s_begin_comment_length(cp);
#else
        cp = src[index];
        return sl_s_begin_comment_length(cp);
#endif
    }
    return 0;
}

/*****************************************************************************/

/**
 * Advances past whitespace and comments starting at index.
 *
 * A comment runs from its starting character up to, but not including, the
 * next '\n' (or the end of input). When parse info is compiled in,
 * *in_out_line is incremented for every single-byte '\n' skipped as
 * whitespace.
 *
 * Returns the index of the first character that is neither whitespace nor
 * part of a comment, or len if the input is exhausted.
 */
static unsigned sl_s_skip_whitespace(const char *src,
#ifndef SL_S_NO_PARSE_INFO
    unsigned *in_out_line,
#endif
    unsigned index,
    unsigned len){
    
    unsigned step;
    
    while(index < len){
        step = sl_s_begin_comment(src, index, len);
        if(step != 0){
            /* Skip the comment marker, then the rest of the line. The
             * newline itself is left for the whitespace branch so that it
             * is counted. */
            index += step;
            while(index < len && src[index] != '\n'){
                index++;
            }
            continue;
        }
        
        step = SL_S_WhitespaceLength(src, index, len);
        if(step == 0)
            return index;
        
#ifndef SL_S_NO_PARSE_INFO
        if(step == 1 && src[index] == '\n')
            (*in_out_line)++;
#endif
        index += step;
    }
    return len;
}

/*****************************************************************************/

/**
 * Decodes one codepoint from src starting at index.
 *
 * Returns the number of bytes consumed, or 0 (leaving *out_code_unit
 * untouched) when index is at or past len.
 *
 * With SL_S_UNICODE, malformed input (a stray continuation byte, an invalid
 * lead byte, a sequence cut off by the end of input, or a lead byte that is
 * not followed by the required continuation bytes) yields U+FFFD and
 * consumes exactly one byte, so decoding resynchronises on the next byte.
 * Overlong encodings are not rejected here; callers must not assume that
 * the consumed length equals SL_S_UTF8Length of the result.
 *
 * Without SL_S_UNICODE every byte is its own code unit.
 *
 * NOTE(review): this is declared through SL_S_PURE_FUNC yet writes through
 * out_code_unit -- confirm the macro does not expand to an attribute that
 * promises no side effects.
 */
SL_S_PURE_FUNC(unsigned) SL_S_UTF8Decode(const char *src,
    unsigned index,
    unsigned len,
    unsigned *out_code_unit){
    
#ifdef SL_S_UNICODE
    unsigned char ext_len;
    unsigned char x;
    unsigned codepoint;
#endif
    
    if(index >= len)
        return 0;
    
#ifdef SL_S_UNICODE
    x = src[index++];
    if((x & 0x80) == 0){
        /* ASCII-range */
        *out_code_unit = x;
        return 1;
    }
    else if((x & 0xF8) == 0xF0){
        /* Four-byte sequence. */
        ext_len = 3;
        codepoint = (x & 0x07);
    }
    else if((x & 0xF0) == 0xE0){
        /* Three-byte sequence. */
        ext_len = 2;
        codepoint = (x & 0x0F);
    }
    else if((x & 0xE0) == 0xC0){
        /* Two-byte sequence. */
        ext_len = 1;
        codepoint = (x & 0x1F);
    }
    else{
        /* Stuck in the middle (of a unit sequence) with you. */
        *out_code_unit = 0xFFFD;
        return 1;
    }
    
    /* Check for cutoff UTF8 sequences. index <= len holds here, so the
     * subtraction cannot wrap (unlike index + ext_len). */
    if(ext_len > len - index){
        *out_code_unit = 0xFFFD;
        return 1;
    }
    
    x = 0;
    do{
        const unsigned char unit = (unsigned char)src[index + x];
        /* Every trailing byte must look like 10xxxxxx. Without this check a
         * stray lead byte would swallow the following bytes, which could be
         * syntactically significant ASCII such as '(' or '"'. */
        if((unit & 0xC0) != 0x80){
            *out_code_unit = 0xFFFD;
            return 1;
        }
        codepoint <<= 6;
        codepoint |= unit & 0x3F;
    }while(++x != ext_len);
    
    *out_code_unit = codepoint;
    return ext_len + 1;
#else
    *out_code_unit = src[index];
    return 1;
#endif
}

/*****************************************************************************/

/**
 * Returns how many bytes SL_S_UTF8Encode writes for the given codepoint.
 *
 * With SL_S_UNICODE this is 1 to 4 depending on the codepoint's magnitude;
 * values of 0x110000 and above are reported as 3, the size of U+FFFD.
 * Without SL_S_UNICODE the result is always 1.
 */
SL_S_PURE_FUNC(unsigned) SL_S_UTF8Length(unsigned codepoint){
#ifdef SL_S_UNICODE
    unsigned bytes;
    
    if(codepoint >= 0x00110000)
        bytes = 3; /* Invalid codepoint: size of 0xFFFD */
    else if(codepoint >= 0x00010000)
        bytes = 4;
    else if(codepoint >= 0x00000800)
        bytes = 3;
    else if(codepoint >= 0x00000080)
        bytes = 2;
    else
        bytes = 1;
    return bytes;
#else
    (void)codepoint;
    return 1;
#endif
}

/*****************************************************************************/

/**
 * Writes the encoding of codepoint to the buffer at `to` and returns the
 * number of bytes written (the same value SL_S_UTF8Length reports).
 *
 * With SL_S_UNICODE the output is UTF-8, and codepoints of 0x110000 and
 * above are written as U+FFFD. Without SL_S_UNICODE a single byte holding
 * the low bits of codepoint is written. No terminator is appended; the
 * caller must provide room for up to 4 bytes.
 *
 * NOTE(review): this is declared through SL_S_PURE_FUNC yet writes through
 * `to` -- confirm the macro does not expand to an attribute that promises
 * no side effects.
 */
SL_S_PURE_FUNC(unsigned) SL_S_UTF8Encode(unsigned codepoint, char *to){
#ifdef SL_S_UNICODE
    if(codepoint >= 0x00110000)
        return SL_S_UTF8Encode(0xFFFD, to);
    
    switch(SL_S_UTF8Length(codepoint)){
        case 1:
            to[0] = (char)codepoint;
            return 1;
        case 2:
            to[0] = (char)(0xC0 | (codepoint >> 6));
            to[1] = (char)(0x80 | (codepoint & 0x3F));
            return 2;
        case 3:
            to[0] = (char)(0xE0 | (codepoint >> 12));
            to[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
            to[2] = (char)(0x80 | (codepoint & 0x3F));
            return 3;
        case 4:
            /* The lead byte carries bits 18..20; the three continuation
             * bytes carry 6 bits each (12..17, 6..11, 0..5). */
            to[0] = (char)(0xF0 | (codepoint >> 18));
            to[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
            to[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
            to[3] = (char)(0x80 | (codepoint & 0x3F));
            return 4;
        default:
            /* SL_S_UTF8Length only ever returns 1 through 4. */
#if (defined __GNUC__)
            __builtin_unreachable();
#elif (defined _MSC_VER)
            __assume(0);
#endif
            return 1;
    }
#else
    to[0] = codepoint;
    return 1;
#endif
}

/*****************************************************************************/

/**
 * Returns the encoded length, in bytes, of codepoint if it is treated as
 * whitespace, or 0 if it is not.
 *
 * Comment-starting characters are reported as whitespace too (with the
 * length given by sl_s_begin_comment_length). When built with SL_S_UNICODE,
 * a set of non-ASCII space, separator and control-picture codepoints is
 * recognised in addition to the ASCII characters.
 */
static unsigned sl_s_whitespace_length(unsigned codepoint){
    const unsigned comment_len = sl_s_begin_comment_length(codepoint);
    
    if(comment_len != 0)
        return comment_len;
    
    switch(codepoint){
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\v':
            return 1;
#ifdef SL_S_UNICODE
        /* Spaces and separators. */
        case 0x0085:
        case 0x00A0:
        case 0x1680:
        case 0x2000:
        case 0x2001:
        case 0x2002:
        case 0x2003:
        case 0x2004:
        case 0x2005:
        case 0x2006:
        case 0x2007:
        case 0x2008:
        case 0x2009:
        case 0x200A:
        case 0x2028:
        case 0x2029:
        case 0x202F:
        case 0x205F:
        case 0x3000:
        case 0xFEFF:
        /* Codepoints from the 0x24xx range. */
        case 0x2409:
        case 0x240A:
        case 0x240B:
        case 0x240C:
        case 0x240D:
        case 0x2420:
        case 0x2423:
        case 0x2424:
            return SL_S_UTF8Length(codepoint);
#endif
        default:
            return 0;
    }
}


/*****************************************************************************/

/**
 * Returns the byte length of the whitespace character at src[index], or 0
 * if index is at or past len, nothing could be decoded, or the character is
 * not whitespace as classified by sl_s_whitespace_length.
 */
SL_S_PURE_FUNC(unsigned) SL_S_WhitespaceLength(const char *src,
    unsigned index,
    unsigned len){
    
    unsigned cp;
    
    if(index < len){
#ifdef SL_S_UNICODE
        if(SL_S_UTF8Decode(src, index, len, &cp) != 0)
            return sl_s_whitespace_length(cp);
#else
        cp = src[index];
        return sl_s_whitespace_length(cp);
#endif
    }
    return 0;
}

/*****************************************************************************/

/**
 * Public wrapper around sl_s_skip_whitespace that does not report line
 * numbers.
 *
 * Returns the index of the first character at or after `index` that is
 * neither whitespace nor part of a comment, or len if there is none.
 */
SL_S_PURE_FUNC(unsigned) SL_S_SkipWhitespace(const char *src,
    unsigned index,
    unsigned len){
    
#ifndef SL_S_NO_PARSE_INFO
    /* Throw-away line counter. It is initialised because the helper
     * increments it, which would otherwise read an indeterminate value. */
    unsigned discarded_line = 0;
#endif
    
    return sl_s_skip_whitespace(src,
#ifndef SL_S_NO_PARSE_INFO
        &discarded_line,
#endif
        index,
        len);
}

/*****************************************************************************/
/**
 * Returns 1 if the codepoint at src[index] is whitespace or starts a
 * comment, and 0 otherwise.
 *
 * This only classifies the codepoint; because it may be the start of a
 * comment, callers must not simply step over it.
 */
static int sl_s_is_whitespace(const char *src,
    unsigned index,
    unsigned len){
    
    if(SL_S_WhitespaceLength(src, index, len) != 0)
        return 1;
    
    return sl_s_begin_comment(src, index, len) != 0;
}

/*****************************************************************************/

/**
 * Parses an unquoted atom starting at *in_out_index.
 *
 * The atom always takes the first codepoint, then extends until the end of
 * input, whitespace, a comment start, '"', '(' or ')', or until its encoded
 * length would exceed SL_S_MAX_LEN. On return *in_out_index is just past
 * the consumed input.
 *
 * Fills in to->ref (1), to->text (allocated with SL_S_Malloc and
 * NUL-terminated) and to->len (byte length excluding the terminator). With
 * SL_S_UNICODE the text is re-encoded, so invalid input bytes come out as
 * U+FFFD.
 */
static void sl_s_parse_unquoted_atom(const char *src,
    unsigned *in_out_index,
    unsigned len,
    struct SL_S_Atom *to){
    
    register sl_s_len_t atom_len;
    register unsigned index;
#ifdef SL_S_UNICODE
    unsigned codepoint;
    unsigned x, enc_len;
#endif
    
    index = *in_out_index;
    atom_len = 0;
    
    /* Nothing left to read: produce an empty atom rather than reading past
     * the end of src. */
    if(index >= len){
        to->ref = 1;
        to->text = SL_S_Malloc(1);
        to->text[0] = '\0';
        to->len = 0;
        return;
    }
    
    /* Pass 1: find the end of the atom and its encoded size. */
    do{
#ifdef SL_S_UNICODE
        /* Decode a UTF8 codepoint. */
        x = SL_S_UTF8Decode(src, index, len, &codepoint);
        enc_len = SL_S_UTF8Length(codepoint);
        
        /* Stop before a codepoint whose encoding would push the atom past
         * SL_S_MAX_LEN. The check uses the re-encoded size, not the number
         * of bytes consumed: an invalid byte consumes 1 byte of input but
         * is written back as the 3-byte U+FFFD. */
        if(SL_S_MAX_LEN - enc_len < atom_len)
            break;
        
        index += x;
        atom_len += enc_len;
#else
        atom_len++;
        index++;
#endif
    }while(index < len &&
        atom_len < SL_S_MAX_LEN &&
        !sl_s_is_whitespace(src, index, len) &&
        src[index] != '"' &&
        src[index] != '(' &&
        src[index] != ')');
    
    to->ref = 1;
    to->text = SL_S_Malloc(atom_len + 1);
    to->len = atom_len;
    
    /* Pass 2: copy (or re-encode) the bytes found by pass 1. */
#ifdef SL_S_UNICODE
    atom_len = 0;
    do{
        x = SL_S_UTF8Decode(src, *in_out_index, len, &codepoint);
        *in_out_index += x;
        atom_len += SL_S_UTF8Encode(codepoint, to->text + atom_len);
    }while(*in_out_index < index);
#else
    SL_S_MemCopy(to->text, src + *in_out_index, atom_len);
    *in_out_index += atom_len;
#endif
    to->text[atom_len] = '\0';
}

/*****************************************************************************/

/**
 * Parses the body of a double-quoted atom. *in_out_index must point just
 * past the opening '"'.
 *
 * The body runs up to the next unescaped '"', the end of input, or the
 * point where the atom's encoded length would exceed SL_S_MAX_LEN. A
 * backslash escapes the following codepoint: \a \b \t \n \v \f \r map to
 * the corresponding control characters and any other escaped codepoint
 * stands for itself. When parse info is compiled in, *in_out_line is
 * incremented for every '\n' in the body.
 *
 * Fills in to->ref (1), to->text (allocated with SL_S_Malloc and always
 * NUL-terminated) and to->len (byte length excluding the terminator). On
 * return *in_out_index is just past the closing '"' if one was found,
 * otherwise at the point where scanning stopped.
 */
static void sl_s_parse_quoted_atom(const char *src,
#ifndef SL_S_NO_PARSE_INFO
    unsigned *in_out_line,
#endif
    unsigned *in_out_index,
    unsigned len,
    struct SL_S_Atom *to){
    
    sl_s_len_t atom_len;
    register unsigned end;
    unsigned pos;
    unsigned step, enc_len;
    unsigned char last_was_slash, needs_rewrite;
    SL_S_NOT_UNICODE_ONLY(register) unsigned codepoint;
    
    atom_len = 0;
    last_was_slash = 0;
    needs_rewrite = 0;
    end = *in_out_index;
    
    /* Pass 1: find the end of the body and an upper bound on its encoded
     * size. atom_len counts the bytes each codepoint will occupy once
     * re-encoded; backslashes are counted too, so the bound is never too
     * small. */
    while(end < len){
#ifdef SL_S_UNICODE
        step = SL_S_UTF8Decode(src, end, len, &codepoint);
        enc_len = SL_S_UTF8Length(codepoint);
#else
        codepoint = src[end];
        step = 1;
        enc_len = 1;
#endif
        /* The closing quote is consumed but is not part of the text. */
        if(!last_was_slash && codepoint == '"'){
            end += step;
            break;
        }
        
        /* Stop before a codepoint that would not fit in SL_S_MAX_LEN. */
        if(SL_S_MAX_LEN - enc_len < atom_len)
            break;
        
        end += step;
        
        /* Input that does not round-trip byte-for-byte (invalid or overlong
         * sequences) cannot take the plain-copy path below. */
        if(step != enc_len)
            needs_rewrite = 1;
        
        if(last_was_slash){
            last_was_slash = 0;
        }
        else if(codepoint == '\\'){
            last_was_slash = 1;
            needs_rewrite = 1;
        }
#ifndef SL_S_NO_PARSE_INFO
        /* Counted whether or not the newline was escaped. */
        if(codepoint == '\n'){
            (*in_out_line)++;
        }
#endif
        atom_len += enc_len;
    }
    
    to->ref = 1;
    to->text = SL_S_Malloc(atom_len + 1);
    
    if(!needs_rewrite){
        /* Fast path: the body is exactly atom_len source bytes. */
        SL_S_MemCopy(to->text, src + *in_out_index, atom_len);
        to->len = atom_len;
    }
    else{
        /* Pass 2: re-decode the same span, resolving escapes. It is bounded
         * by `end`, so it can never emit more than the atom_len bytes that
         * pass 1 accounted for. */
        last_was_slash = 0;
        to->len = 0;
        pos = *in_out_index;
        while(pos < end){
#ifdef SL_S_UNICODE
            pos += SL_S_UTF8Decode(src, pos, len, &codepoint);
#else
            codepoint = src[pos];
            pos++;
#endif
            if(last_was_slash){
                last_was_slash = 0;
                switch(codepoint){
                    case 'a': codepoint = 0x07; break;
                    case 'b': codepoint = 0x08; break;
                    case 't': codepoint = 0x09; break;
                    case 'n': codepoint = 0x0A; break;
                    case 'v': codepoint = 0x0B; break;
                    case 'f': codepoint = 0x0C; break;
                    case 'r': codepoint = 0x0D; break;
                    default: break; /* Any other codepoint is literal. */
                }
            }
            else if(codepoint == '\\'){
                last_was_slash = 1;
                continue;
            }
            else if(codepoint == '"'){
                break;
            }
            
            to->len += SL_S_UTF8Encode(codepoint, to->text + to->len);
        }
    }
    
    to->text[to->len] = '\0';
    *in_out_index = end;
}

/*****************************************************************************/

static struct SL_S_List *sl_s_parse_list(const char *src,
#ifndef SL_S_NO_PARSE_INFO
    unsigned *in_out_line,
#endif
    unsigned *in_out_index,
    unsigned len){
    
    struct SL_S_List *ret, **tail;
    tail = &ret;
    do{
        if(src[*in_out_index] == ')'){
            (*in_out_index)++;
            break;
        }
        
        *tail = SL_S_Malloc(sizeof(struct SL_S_List));
#ifndef SL_S_NO_PARSE_INFO
        if(*in_out_line > ~(sl_s_len_t)0)
            (*tail)->line = ~(sl_s_len_t)0;
        else
            (*tail)->line = *in_out_line;
#endif
        sl_s_parse_value(src,
#ifndef SL_S_NO_PARSE_INFO
            in_out_line,
#endif
            in_out_index,
            len,
            &((*tail)->head));
        tail = &((*tail)->tail);
        
        *in_out_index = sl_s_skip_whitespace(src,
#ifndef SL_S_NO_PARSE_INFO
            in_out_line,
#endif
            *in_out_index,
            len);
    }while(*in_out_index < len);
    *tail = NULL;
    return ret;
}

/*****************************************************************************/

/**
 * Parses one atom at *in_out_index into `to`.
 *
 * A leading '"' selects the quoted form (the quote itself is skipped before
 * the quoted parser runs); anything else is parsed as an unquoted atom.
 * When parse info is compiled in, to->line records the line on which the
 * atom starts.
 */
static void sl_s_parse_atom(const char *src,
#ifndef SL_S_NO_PARSE_INFO
    unsigned *in_out_line,
#endif
    unsigned *in_out_index,
    unsigned len,
    struct SL_S_Atom *to){
    
    const int is_quoted = (src[*in_out_index] == '"');
    
#ifndef SL_S_NO_PARSE_INFO
    to->line = *in_out_line;
#endif
    
    if(!is_quoted){
        sl_s_parse_unquoted_atom(src, in_out_index, len, to);
        return;
    }
    
    /* Step over the opening quote. */
    (*in_out_index)++;
    sl_s_parse_quoted_atom(src,
#ifndef SL_S_NO_PARSE_INFO
        in_out_line,
#endif
        in_out_index,
        len,
        to);
}

/*****************************************************************************/

/**
 * Parses one value (a list or an atom) at *in_out_index.
 *
 * A '(' starts a list: the parenthesis and any following whitespace are
 * skipped and the elements are parsed by sl_s_parse_list, or the value is
 * SL_S_NIL when the input ends right there. Anything else is parsed as an
 * atom into a freshly allocated struct SL_S_Atom.
 *
 * The result stored in *to is the value pointer advanced by
 * SL_S_LIST_TAG or SL_S_ATOM_TAG bytes respectively.
 */
static void sl_s_parse_value(const char *src,
#ifndef SL_S_NO_PARSE_INFO
    unsigned *in_out_line,
#endif
    unsigned *in_out_index,
    unsigned len,
    void **to){
    
    void *payload;
    unsigned tag_bits;
    
    if(src[*in_out_index] != '('){
        payload = SL_S_Malloc(sizeof(struct SL_S_Atom));
        tag_bits = SL_S_ATOM_TAG;
        sl_s_parse_atom(src,
#ifndef SL_S_NO_PARSE_INFO
            in_out_line,
#endif
            in_out_index,
            len,
            payload);
    }
    else{
        tag_bits = SL_S_LIST_TAG;
        /* Step over the '(' and whatever blank space follows it. */
        *in_out_index = sl_s_skip_whitespace(src,
#ifndef SL_S_NO_PARSE_INFO
            in_out_line,
#endif
            *in_out_index + 1,
            len);
        if(*in_out_index < len)
            payload = sl_s_parse_list(src,
#ifndef SL_S_NO_PARSE_INFO
                in_out_line,
#endif
                in_out_index,
                len);
        else
            payload = SL_S_NIL;
    }
    *to = ((char*)payload) + tag_bits;
}

/*****************************************************************************/

/**
 * Parses the next value from src.
 *
 * Parsing starts at *in_out_index, or at 0 when in_out_index is a null
 * pointer. Leading whitespace and comments are skipped first. When an index
 * pointer is supplied it is updated to the position after what was
 * consumed.
 *
 * Returns 1, leaving *out untouched, if only whitespace/comments remain.
 * Otherwise stores the tagged value in *out and returns 0.
 */
SL_S_FUNC(int) SL_S_ParseValue(const char *src,
    unsigned *in_out_index,
    unsigned len,
    void **out){
    
    unsigned local_index;
#ifndef SL_S_NO_PARSE_INFO
    unsigned line;
#endif

    if(!in_out_index){
        local_index = 0;
        in_out_index = &local_index;
    }

#ifndef SL_S_NO_PARSE_INFO
    /* Lines are numbered from 1; count the newlines before the start. */
    line = sl_s_count_newlines(src, *in_out_index) + 1;
#endif

    *in_out_index = sl_s_skip_whitespace(src,
#ifndef SL_S_NO_PARSE_INFO
        &line,
#endif
        *in_out_index,
        len);
    
    if(*in_out_index >= len)
        return 1;
    
    sl_s_parse_value(src,
#ifndef SL_S_NO_PARSE_INFO
        &line,
#endif
        in_out_index,
        len,
        out);
    return 0;
}

/*****************************************************************************/

/**
 * Parses list elements from src via sl_s_parse_list and stores the
 * resulting list in *to.
 *
 * Parsing starts at *in_out_index, or at 0 when in_out_index is a null
 * pointer; a supplied index is updated to the position after what was
 * consumed. Leading whitespace is not skipped here. Always returns 0.
 */
SL_S_FUNC(int) SL_S_ParseList(const char *src,
    unsigned *in_out_index,
    unsigned len,
    struct SL_S_List **to){
    
    unsigned local_index;
#ifndef SL_S_NO_PARSE_INFO
    unsigned line;
#endif

    if(!in_out_index){
        local_index = 0;
        in_out_index = &local_index;
    }

#ifndef SL_S_NO_PARSE_INFO
    /* Lines are numbered from 1; count the newlines before the start. */
    line = sl_s_count_newlines(src, *in_out_index) + 1;
#endif
    
    *to = sl_s_parse_list(src,
#ifndef SL_S_NO_PARSE_INFO
        &line,
#endif
        in_out_index,
        len);
    
    return 0;
}

/*****************************************************************************/

/**
 * Parses a single atom from src into `to`.
 *
 * Parsing starts at *in_out_index, or at 0 when in_out_index is a null
 * pointer; a supplied index is updated to the position after what was
 * consumed. Leading whitespace is not skipped here.
 *
 * Returns 1, leaving `to` untouched, if the start index is already at or
 * past len (the same end-of-input result SL_S_ParseValue gives). Returns 0
 * after an atom has been parsed.
 */
SL_S_FUNC(int) SL_S_ParseAtom(const char *src,
    unsigned *in_out_index,
    unsigned len,
    struct SL_S_Atom *to){
    
    unsigned backup_index;
#ifndef SL_S_NO_PARSE_INFO
    unsigned line;
#endif

    if(!in_out_index){
        in_out_index = &backup_index;
        backup_index = 0;
    }

    /* Nothing to parse. Without this guard the atom parser would read
     * src[len], which is outside the caller's buffer. */
    if(*in_out_index >= len)
        return 1;

#ifndef SL_S_NO_PARSE_INFO
    /* Lines are numbered from 1; count the newlines before the start. */
    line = 1 + sl_s_count_newlines(src, *in_out_index);
#endif
    
    sl_s_parse_atom(src,
#ifndef SL_S_NO_PARSE_INFO
        &line,
#endif
        in_out_index,
        len,
        to);
    return 0;
}
