CsvParser.php

<?php

/**
 *  Loads a CSV file into an array via File_CSV_IteratorReader and carries a
 *  pair of (currently unused) helpers that escape strings for N-Triples.
 *
 *  @category   component
 *  @package    csvimport
 *  @author     Michael Martin martin@informatik.uni-leipzig.de
 */
class CsvParser
{
    /**
     * Replacement emitted for every byte sequence that is not valid UTF-8.
     * The value is the six characters backslash, "u", "F", "F", "F", "D",
     * i.e. the N-Triples escape of U+FFFD (REPLACEMENT CHARACTER).
     */
    const error_character = '\\uFFFD';

    /**
     *  The parsed CSV file, exactly as returned by
     *  File_CSV_IteratorReader::toArray().
     *
     *  @var    array
     *  @access private
     *  @author Michael Martin martin@informatik.uni-leipzig.de
     */
    private $csvMap;

    /**
     * Parses the given CSV file and stores the result.
     *
     * Note that constructing a parser raises PHP's execution time and memory
     * limits and switches on line-ending auto-detection for the whole
     * request, not only for this object.
     *
     * @access  public
     * @param   string $fileName   Path of the CSV file to parse.
     * @param   string $separator  Field separator handed to File_CSV_IteratorReader.
     * @param   bool   $useHeaders Handed to File_CSV_IteratorReader::toArray()
     *                             (presumably keys rows by the header line --
     *                             confirm against that class).
     * @author  Michael Martin martin@informatik.uni-leipzig.de
     *
     * @throws Exception When the parsed result is empty (unreadable or empty file).
     */
    public function __construct($fileName = "", $separator = ',', $useHeaders = false )
    {
        // Lift limits that large imports would otherwise hit.
        ini_set("max_execution_time", "600");
        ini_set("memory_limit", "1536M");
        // NOTE(review): "auto_detect_line_endings" is deprecated as of PHP 8.1;
        // kept because files with bare CR line endings depend on it.
        ini_set("auto_detect_line_endings", true);

        // Start from a defined state in case parsing fails part-way.
        $this->csvMap = array();

        // Parse the file and reject an empty result.
        $this->csvMap = $this->readCSV($fileName, $separator, $useHeaders);
        if (empty($this->csvMap)) {
            throw new Exception('The file "'.$fileName.'" cannot be readed or is empty.');
        }
    }


    /**
     * Returns the parsed CSV data.
     *
     * @access public
     * @return array The array produced while constructing this object.
     */
    public function getParsedFile ()
    {
        return $this->csvMap;
    }

//
// Private functions
//

    /**
     * Reads the CSV file through File_CSV_IteratorReader and converts it to
     * an array.
     *
     * @access private
     * @param  string $fileName   Path of the CSV file.
     * @param  string $separator  Field separator handed to the reader.
     * @param  bool   $useHeaders Handed to the reader's toArray().
     * @return mixed  Whatever File_CSV_IteratorReader::toArray() returns
     *                (the constructor treats an empty value as failure).
     */
    private function readCSV ($fileName, $separator = ",", $useHeaders = false)
    {
        $csvReader = new File_CSV_IteratorReader($fileName, $separator);
        return $csvReader->toArray($useHeaders);
    }

//
// TODO: The following functions are not used yet; they may be useful in
//       further workflows.
//

    /**
     * Escapes a string for use inside an N-Triples literal.
     *
     * Only byte sequences that need escaping are matched; bytes that may stay
     * unencoded in N-Triples are not touched by the regex. Matched are:
     *
     *   0x00-0x1F   non-printable characters
     *   0x22        double quote (")
     *   0x5C        backslash (\)
     *   0x7F        non-printable character (DEL)
     *   0x80-0xBF   unexpected continuation byte
     *   0xC0-0xFF   first byte of a multi-byte character, followed by zero
     *               or more continuation bytes (0x80-0xBF)
     *
     * The regex deliberately accepts multi-byte sequences with the wrong
     * number of continuation bytes; escape_callback() validates them.
     *
     * @access private
     * @param  string $str Raw (byte) string, expected to be UTF-8.
     * @return string|null The escaped string, or null if the regex engine
     *                     reports an error (see preg_replace_callback()).
     */
    private function escape( $str )
    {
        return preg_replace_callback(
            "/[\\x00-\\x1F\\x22\\x5C\\x7F]|[\\x80-\\xBF]|[\\xC0-\\xFF][\\x80-\\xBF]*/",
            // The callback lives in this class. It is private, which is fine
            // because preg_replace_callback() is invoked from class scope.
            array(__CLASS__, 'escape_callback'),
            $str
        );
    }

    /**
     * Turns one match of the regex in escape() into its N-Triples escape.
     *
     * @access private
     * @param  array  $matches Match array from preg_replace_callback();
     *                         only element 0 (the matched bytes) is used.
     * @return string The escape sequence: \t, \n, \r, \", \\, \uXXXX or
     *                \UXXXXXXXX; self::error_character for invalid input,
     *                followed by one more self::error_character per surplus
     *                continuation byte.
     */
    private static function escape_callback($matches)
    {
        $encoded_character = $matches[0];
        $byte = ord($encoded_character[0]);

        // Single-byte characters (0xxxxxxx, hex 00-7F)
        if ($byte == 0x09) {
            return "\\t";
        }
        if ($byte == 0x0A) {
            return "\\n";
        }
        if ($byte == 0x0D) {
            return "\\r";
        }
        if ($byte == 0x22) {
            return "\\\"";
        }
        if ($byte == 0x5C) {
            return "\\\\";
        }
        if ($byte < 0x20 || $byte == 0x7F) {
            // Remaining control characters: encode as \u00XX
            return "\\u00" . sprintf("%02X", $byte);
        }

        // Multi-byte characters
        if ($byte < 0xC0) {
            // Continuation bytes (0x80-0xBF) must not appear as first byte
            return self::error_character;
        }
        if ($byte < 0xE0) {
            // 110xxxxx, hex C0-DF
            $bytes = 2;
            $codepoint = $byte & 0x1F;
        } elseif ($byte < 0xF0) {
            // 1110xxxx, hex E0-EF
            $bytes = 3;
            $codepoint = $byte & 0x0F;
        } elseif ($byte < 0xF8) {
            // 11110xxx, hex F0-F7
            $bytes = 4;
            $codepoint = $byte & 0x07;
        } elseif ($byte < 0xFC) {
            // 111110xx, hex F8-FB
            $bytes = 5;
            $codepoint = $byte & 0x03;
        } elseif ($byte < 0xFE) {
            // 1111110x, hex FC-FD
            $bytes = 6;
            $codepoint = $byte & 0x01;
        } else {
            // 11111110 and 11111111, hex FE-FF, are never valid
            return self::error_character;
        }

        // Verify the number of continuation bytes (0x80-0xBF)
        $length = strlen($encoded_character);
        if ($length < $bytes) {
            // Not enough continuation bytes
            return self::error_character;
        }

        // Surplus continuation bytes are reported as one error each,
        // appended after whatever the leading sequence decodes to.
        $rest = ($length > $bytes)
            ? str_repeat(self::error_character, $length - $bytes)
            : '';

        // Assemble the code point: every continuation byte (10xxxxxx)
        // contributes its lowest six bits.
        for ($i = 1; $i < $bytes; $i++) {
            $byte = ord($encoded_character[$i]);
            $codepoint = ($codepoint << 6) | ($byte & 0x3F);
        }

        // Reject overlong encodings (a code point encoded with more bytes
        // than necessary); a safe UTF-8 decoder must not accept them.
        if (($bytes == 2 && $codepoint <= 0x7F) ||
            ($bytes == 3 && $codepoint <= 0x7FF) ||
            ($bytes == 4 && $codepoint <= 0xFFFF) ||
            ($bytes == 5 && $codepoint <= 0x1FFFFF) ||
            ($bytes == 6 && $codepoint <= 0x3FFFFFF)
        ) {
            return self::error_character . $rest;
        }

        // UTF-16 surrogates must not be encoded in UTF-8
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return self::error_character . $rest;
        }

        // Misc. illegal code positions
        if ($codepoint == 0xFFFE || $codepoint == 0xFFFF) {
            return self::error_character . $rest;
        }

        if ($codepoint <= 0xFFFF) {
            // 0x0080-0xFFFF, encode as \uXXXX
            return "\\u" . sprintf("%04X", $codepoint) . $rest;
        }

        if ($codepoint <= 0x10FFFF) {
            // 0x10000-0x10FFFF, encode as \UXXXXXXXX
            return "\\U" . sprintf("%08X", $codepoint) . $rest;
        }

        // Above 0x10FFFF no code points are assigned
        // (this also covers every 5- and 6-byte sequence).
        return self::error_character . $rest;
    }
}