#!/bin/bash
set -u

# The NSYS_CUDAGPUTRACE_NAME_WIDTH env var can be used to set the width of
# the Name column in non-CSV outputs.
#
# EXAMPLE: This will clip the column width to 80 charaters:
# $ export NSYS_CUDAGPUTRACE_NAME_WIDTH=80
#
# Unfortunatly, due to the way the SQLite shell works, 333 is the maximum
# usable column width (1000/3).  Even if the value is set higher, values
# will be clipped before being displayed.
NAMEWIDTH=${NSYS_CUDAGPUTRACE_NAME_WIDTH:-333}

function table_exists()
{
    if [ -z "${TABLES[*]}" ]
    then
        readarray -t TABLES <<< $(${SQLITE3} "$DATABASE" "SELECT name FROM sqlite_master WHERE type = 'table'")
    fi

    for TABLE in "${TABLES[@]}"
    do
        if [ "${TABLE}" = "$1" ]
        then
            echo "true"
            return 1
        fi
    done
    echo "false"
    return 0
}

DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
SQLITE3="${DIR}"/sqlite3
SQLITE3OPTS= #"-echo"

# check number of arguments
if [ $# -lt 1 ]||[ $# -gt 2 ]; then
    echo " usage: ${BASH_SOURCE[0]} <SQLite_file> <format>"
    echo
    echo "    SQLite_file: An SQLite database file exported from an NSys QDREP file"
    echo "    format:      The desired output format: nvprof, nvprofcsv, or pretty"
    echo
    echo "    WARNING: The \"pretty\" output format is be subject to incompatible changes."
    echo
    exit 1
fi

DATABASE="$1"
FORMAT=${2:-'UNKNOWN'}

# check if file exists
if [ ! -f "$DATABASE" ]
then
    echo "$DATABASE file not found. Exiting"
    exit 1
fi

# check if file opened is DB; if not, exit.
# The sqlite3 file format is defined at https://www.sqlite.org/draft/fileformat.html
HEADER=$(head -c 16 "$DATABASE" | tr '\0' '\n')
if [ "$HEADER" != "SQLite format 3" ]
then
    echo "$DATABASE is not an SQLite DB file. Exiting."
    exit 1
fi

if ! $(table_exists CUPTI_ACTIVITY_KIND_KERNEL)
then
    echo "$DATABASE does not contain the CUDA GPU activity. Exiting."
    exit 2
fi

if ! $(table_exists TARGET_INFO_CUDA_GPU)
then
    echo "$DATABASE does not contain information about the system's GPU(s). Exiting."
    exit 2
fi

SQL_MEMCPY_UNION=''
if $(table_exists CUPTI_ACTIVITY_KIND_MEMCPY)
then
    read -d '' SQL_MEMCPY_UNION << EOF
UNION ALL

SELECT
    start AS "StartNanosec",
    (end - start) AS "DurNanosec",
    NULL AS "Grid",
    NULL AS "GridX",
    NULL AS "GridY",
    NULL AS "GridZ",
    NULL AS "Block",
    NULL AS "BlockX",
    NULL AS "BlockY",
    NULL AS "BlockZ",
    NULL AS "RegsPerThread",
    NULL AS "SSMem",
    NULL AS "DSMem",
    bytes AS "SizeBytes",
    (bytes * 1000000000.0) / CAST(end - start AS REAL) AS "ThroughputBPS",
    msrck.name AS "SrcMemKind",
    mdstk.name AS "DstMemKind",
    NULL AS "MemsetValue",
    printf('%s (%d)', gpu.name, deviceId) AS "Device",
    contextId AS "Context",
    streamId AS "Stream",
    memopstr.name AS "Name",
    correlationId AS "CorID"
FROM
    CUPTI_ACTIVITY_KIND_MEMCPY AS memcpy
LEFT JOIN
    MemcpyOperStrings AS memopstr
    ON memcpy.copyKind = memopstr.id
LEFT JOIN
    MemKindStrings AS msrck
    ON memcpy.srcKind = msrck.id
LEFT JOIN
    MemKindStrings AS mdstk
    ON memcpy.dstKind = mdstk.id
LEFT JOIN
    TARGET_INFO_CUDA_GPU AS gpu
    USING( deviceId )
EOF
fi


SQL_MEMSET_UNION=''
if $(table_exists CUPTI_ACTIVITY_KIND_MEMSET)
then
    read -d '' SQL_MEMSET_UNION << EOF
UNION ALL

SELECT
    start AS "StartNanosec",
    (end - start) AS "DurNanosec",
    NULL AS "Grid",
    NULL AS "GridX",
    NULL AS "GridY",
    NULL AS "GridZ",
    NULL AS "Block",
    NULL AS "BlockX",
    NULL AS "BlockY",
    NULL AS "BlockZ",
    NULL AS "RegsPerThread",
    NULL AS "SSMem",
    NULL AS "DSMem",
    bytes AS "SizeBytes",
    (bytes * 1000000000.0) / CAST(end - start AS REAL) AS "ThroughputBPS",
    mk.name AS "SrcMemKind",
    NULL AS "DstMemKind",
    value AS "MemsetValue",
    printf('%s (%d)', gpu.name, deviceId) AS "Device",
    contextId AS "Context",
    streamId AS "Stream",
    '[CUDA memset]' AS "Name",
    correlationId AS "CorID"
FROM
    CUPTI_ACTIVITY_KIND_MEMSET AS memset
LEFT JOIN
    MemKindStrings AS mk
    ON memset.memKind = mk.id
LEFT JOIN
    TARGET_INFO_CUDA_GPU AS gpu
    USING( deviceId )
EOF
fi

# Build common helper tables and trace view

"${SQLITE3}" ${SQLITE3OPTS} "${DATABASE}" << EOF
BEGIN;

CREATE TABLE IF NOT EXISTS MemKindStrings (
    id      INTEGER  NOT NULL  PRIMARY KEY,
    name    TEXT);
INSERT OR IGNORE INTO MemKindStrings (id, name) VALUES
    (0,     'Pageable'),
    (1,     'Pinned'),
    (2,     'Device'),
    (3,     'Array'),
    (4,     'Unknown');

CREATE TABLE IF NOT EXISTS MemcpyOperStrings (
    id      INTEGER  NOT NULL  PRIMARY KEY,
    name    TEXT);
INSERT OR IGNORE INTO MemcpyOperStrings (id, name) VALUES
    (0,     '[CUDA memcpy Unknown]'),
    (1,     '[CUDA memcpy HtoD]'),
    (2,     '[CUDA memcpy DtoH]'),
    (3,     '[CUDA memcpy HtoA]'),
    (4,     '[CUDA memcpy AtoH]'),
    (5,     '[CUDA memcpy AtoA]'),
    (6,     '[CUDA memcpy AtoD]'),
    (7,     '[CUDA memcpy DtoA]'),
    (8,     '[CUDA memcpy DtoD]'),
    (9,     '[CUDA memcpy HtoH]'),
    (10,    '[CUDA memcpy PtoP]'),
    (11,    '[CUDA Unified Memory memcpy HtoD]'),
    (12,    '[CUDA Unified Memory memcpy DtoH]'),
    (13,    '[CUDA Unified Memory memcpy DtoD]');

DROP VIEW IF EXISTS GpuTraceView;
CREATE VIEW IF NOT EXISTS GpuTraceView AS
    SELECT
        start AS "StartNanosec",
        (end - start) AS "DurNanosec",
        printf('(%d %d %d)', gridX, gridY, gridZ) AS "Grid",
        gridX AS "GridX",
        gridY AS "GridY",
        gridZ AS "GridZ",
        printf('(%d %d %d)', blockX, blockY, blockZ) AS "Block",
        blockX AS "BlockX",
        blockY AS "BlockY",
        blockZ AS "BlockZ",
        registersPerThread AS "RegsPerThread",
        staticSharedMemory AS "SSMemBytes",
        dynamicSharedMemory AS "DSMemBytes",
        NULL AS "SizeBytes",
        NULL AS "ThroughputBPS",
        NULL AS "SrcMemKind",
        NULL AS "DstMemKind",
        NULL AS "MemsetValue",
        printf('%s (%d)', gpu.name, deviceId) AS "Device",
        contextId AS "Context",
        streamId AS "Stream",
        dmn.value AS "Name",
        correlationId AS "CorID"
    FROM
        CUPTI_ACTIVITY_KIND_KERNEL
    LEFT JOIN
        StringIds AS dmn
        ON CUPTI_ACTIVITY_KIND_KERNEL.demangledName = dmn.id
    LEFT JOIN
        TARGET_INFO_CUDA_GPU AS gpu
        USING( deviceId )

    ${SQL_MEMCPY_UNION}
    ${SQL_MEMSET_UNION}
    ;

COMMIT;
EOF

if [ $FORMAT == 'nvprof' ]
then

"${SQLITE3}" ${SQLITE3OPTS} "${DATABASE}" << EOF
.header on
.mode column
.width -8 -8 -19 -14 -8 -8 -8 -8 -10 -10 -10 -15 -8 -8 ${NAMEWIDTH}

SELECT
    CASE
    WHEN StartNanosec > 1000000000 THEN
        printf('%.7ss', printf('%7f', StartNanosec / 1000000000.0))
    WHEN StartNanosec > 1000000 THEN
        printf('%.6sms', printf('%7f', StartNanosec / 1000000.0))
    WHEN StartNanosec > 1000 THEN
        printf('%.6sus', printf('%7f', StartNanosec / 1000.0))
    ELSE
        printf('%.6sns', printf('%7f', StartNanosec / 1.0))
    END AS "Start",
    CASE
    WHEN DurNanosec > 1000000000 THEN
        printf('%.7ss', printf('%7f', DurNanosec / 1000000000.0))
    WHEN DurNanosec > 1000000 THEN
        printf('%.6sms', printf('%7f', DurNanosec / 1000000.0))
    WHEN DurNanosec > 1000 THEN
        printf('%.6sus', printf('%7f', DurNanosec / 1000.0))
    ELSE
        printf('%.6sns', printf('%7f', DurNanosec / 1.0))
    END AS "Duration",
    coalesce(Grid, '-') AS "Grid Size",
    coalesce(Block, '-') AS "Block Size",
    coalesce(RegsPerThread, '-') AS "Regs*",
    CASE
    WHEN SSMemBytes IS NULL THEN '-'
    WHEN SSMemBytes > (1024 * 1024 * 1024) THEN
        printf('%.6sGB', printf('%7f', CAST(SSMeMBytes AS REAL) / (1024 * 1024 * 1024)))
    WHEN SSMemBytes > (1024 * 1024) THEN
        printf('%.6sMB', printf('%7f', CAST(SSMeMBytes AS REAL) / (1024 * 1024)))
    WHEN SSMemBytes > (1024) THEN
        printf('%.6sKB', printf('%7f', CAST(SSMeMBytes AS REAL) / (1024)))
    ELSE
        printf('%.7sB', printf('%d', SSMeMBytes))
    END AS "SSMem*",
    CASE
    WHEN DSMemBytes IS NULL THEN '-'
    WHEN DSMemBytes > (1024 * 1024 * 1024) THEN
        printf('%.6sGB', printf('%7f', CAST(DSMemBytes AS REAL) / (1024 * 1024 * 1024)))
    WHEN DSMemBytes > (1024 * 1024) THEN
        printf('%.6sMB', printf('%7f', CAST(DSMemBytes AS REAL) / (1024 * 1024)))
    WHEN DSMemBytes > (1024) THEN
        printf('%.6sKB', printf('%7f', CAST(DSMemBytes AS REAL) / (1024)))
    ELSE
        printf('%.7sB', printf('%d', DSMemBytes))
    END AS "DSMem*",
    CASE
    WHEN SizeBytes IS NULL THEN '-'
    WHEN SizeBytes > (1024 * 1024 * 1024) THEN
        printf('%.6sGB', printf('%7f', CAST(SizeBytes AS REAL) / (1024 * 1024 * 1024)))
    WHEN SizeBytes > (1024 * 1024) THEN
        printf('%.6sMB', printf('%7f', CAST(SizeBytes AS REAL) / (1024 * 1024)))
    WHEN SizeBytes > (1024) THEN
        printf('%.6sKB', printf('%7f', CAST(SizeBytes AS REAL) / (1024)))
    ELSE
        printf('%.7sB', printf('%d', SizeBytes))
    END AS "Size",
    CASE
    WHEN ThroughputBPS IS NULL THEN '-'
    WHEN ThroughputBPS > (1024 * 1024 * 1024) THEN
        printf('%.6sGB/s', printf('%7f', CAST(ThroughputBPS AS REAL) / (1024 * 1024 * 1024)))
    WHEN ThroughputBPS > (1024 * 1024) THEN
        printf('%.6sMB/s', printf('%7f', CAST(ThroughputBPS AS REAL) / (1024 * 1024)))
    WHEN ThroughputBPS > (1024) THEN
        printf('%.6sKB/s', printf('%7f', CAST(ThroughputBPS AS REAL) / (1024)))
    ELSE
        printf('%.7sB/s', printf('%d', ThroughputBPS))
    END AS "Throughput",
    coalesce(SrcMemKind, '-') AS "SrcMemType",
    coalesce(DstMemKind, '-') AS "DstMemType",
    Device AS "Device",
    Context AS "Context",
    Stream AS "Stream",
    CASE
    WHEN SizeBytes IS NULL THEN
        -- CUDA Kernel event
        Name || printf(' [%d]', CorID)
    ELSE
        -- MEMCPY/MEMSET event (Does not have Correlation ID)
        Name
    END AS "Name"
FROM
    GpuTraceView
ORDER BY StartNanosec;

.print ""
.print "Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows."
.print "SSMem: Static shared memory allocated per CUDA block."
.print "DSMem: Dynamic shared memory allocated per CUDA block."
.print "SrcMemType: The type of source memory accessed by memory operation/copy"
.print "DstMemType: The type of destination memory accessed by memory operation/copy"
.print ""

EOF


elif [ $FORMAT == 'nvprofcsv' ]
then

# This query depends on the ORDER BY of the sub-SELECT being preserved through
# the outer SELECT.  The outer SELECT is only used to strip off the
# StartNanosec column.  All of this is done to inject the units row as the
# second row and preserve the Nvprof CSV format.

"${SQLITE3}" ${SQLITE3OPTS} "${DATABASE}" << EOF
.header on
.mode csv

SELECT
    Start AS "Start",
    Duration AS "Duration",
    GridX AS "Grid X",
    GridY AS "Grid Y",
    GridZ AS "Grid Z",
    BlockX AS "Block X",
    BlockY AS "Block Y",
    BlockZ AS "Block Z",
    RegsPerThread AS "Registers Per Thread",
    SSMemBytes AS "Static SMem",
    DSMemBytes AS "Dynamic SMem",
    SizeBytes AS "Size",
    Throughput AS "Throughput",
    SrcMemKind AS "SrcMemType",
    DstMemKind AS "DstMemType",
    Device AS "Device",
    Context AS "Context",
    Stream AS "Stream",
    Name AS "Name",
    Correlation_ID AS "Correlation_ID"

FROM (
    SELECT -- adds the units line in the original nvprof output
    -1 AS "StartNanosec",
    's' AS "Start",
    'us' AS "Duration",
    NULL AS "GridX",
    NULL AS "GridY",
    NULL AS "GridZ",
    NULL AS "BlockX",
    NULL AS "BlockY",
    NULL AS "BlockZ",
    NULL AS "RegsPerThread",
    'KB' AS "SSMemBytes",
    'B' AS "DSMemBytes",
    'MB' AS "SizeBytes", -- in MB
    'GB/s' AS "Throughput",
    NULL AS "SrcMemKind",
    NULL AS "DstMemKind",
    NULL AS "Device",
    NULL AS "Context",
    NULL AS "Stream",
    NULL AS "Name",
    NULL AS "Correlation_ID"

    UNION ALL

    SELECT
    StartNanosec AS "StartNanosec",
    printf('%8.6f', StartNanosec / 1000000000.0 ) AS "Start", -- in sec
    printf('%8.6f', DurNanosec / 1000.0 ) AS "Duration", -- in microsec (us)
    GridX AS "GridX",
    GridY AS "GridY",
    GridZ AS "GridZ",
    BlockX AS "BlockX",
    BlockY AS "BlockY",
    BlockZ AS "BlockZ",
    RegsPerThread AS "RegsPerThread",
    CASE
        WHEN SSMemBytes IS NULL THEN NULL
        ELSE printf('%8f', SSMemBytes / 1024.0) -- in KB
    END AS "SSMemBytes",
    DSMemBytes AS "DSMemBytes", -- in bytes
    CASE
        WHEN SizeBytes IS NULL THEN NULL
        ELSE printf('%.6f', SizeBytes / (1024.0 * 1024.0)) -- in MB
    END AS "SizeBytes",
    CASE
        WHEN SizeBytes IS NULL THEN NULL
        ELSE printf('%8.6f', CAST(SizeBytes AS REAL) / DurNanosec) -- in GB/s
    END AS "Throughput",
    SrcMemKind AS "SrcMemKind",
    DstMemKind AS "DstMemKind",
    Device AS "Device",
    Context AS "Context",
    Stream AS "Stream",
    Name AS "Name",
    CorID AS "Correlation_ID"
    FROM
        GpuTraceView

    ORDER BY StartNanosec
);

EOF

elif [ $FORMAT == 'pretty' ]
then

"${SQLITE3}" ${SQLITE3OPTS} "${DATABASE}" << EOF
.header on
.mode column
.width -10 -9 -11 -4 -4 -4 -4 -4 -4 -7 -7 -7 -14 -11 -8 -8 0 -4 -4 ${NAMEWIDTH}

SELECT
    printf('%.6f', StartNanosec / 1000000000.0 ) AS "Start(sec)",
    printf('%.3f', DurNanosec / 1000.0 ) AS "Dur(usec)",
    CorID AS "Correlation",
    coalesce(GridX, '-') AS "GrdX",
    coalesce(GridY, '-') AS "GrdY",
    coalesce(GridZ, '-') AS "GrdZ",
    coalesce(BlockX, '-') AS "BlkX",
    coalesce(BlockY, '-') AS "BlkY",
    coalesce(BlockZ, '-') AS "BlkZ",
    coalesce(RegsPerThread, '-') AS "Reg/Trd",
    coalesce(SSMemBytes, '-') AS "StcSMem",
    coalesce(DSMemBytes, '-') AS "DymSMem",
    CASE
    WHEN SizeBytes IS NULL THEN '-'
    ELSE printf('%,d', SizeBytes)
    END AS "Bytes",
    CASE
    WHEN SizeBytes IS NULL THEN '-'
    ELSE printf('%.3f', (SizeBytes * 1000.0) / DurNanosec)
    END AS "Thru(MiB/s)",
    coalesce(SrcMemKind, '-') AS "SrcMemKd",
    coalesce(DstMemKind, '-') AS "DstMemKd",
    Device AS "Device",
    Context AS "Ctx",
    Stream AS "Strm",
    Name AS "Name"
FROM
        GpuTraceView
ORDER BY StartNanosec;

EOF

else
    echo ' Unknown export format: Please use "nvprof", "nvprofcsv", or "pretty"'
    exit 1
fi

exit
