#!/bin/bash

# ╔════════════════════════════════════════════════════════════════╗
# ║              LEVO BULK REPOSITORY SCANNER                      ║
# ║                                                                ║
# ║  Scans ALL repos in a GitHub org, generates OpenAPI specs,     ║
# ║  and uploads them to the Levo.ai dashboard.                    ║
# ║                                                                ║
# ║  - Runs 4 containers in parallel (1 CPU + 2 GB each)           ║
# ║  - Pre-filters unsupported languages via GitHub API            ║
# ║  - Per-repo card output (no interleaved logs)                  ║
# ╚════════════════════════════════════════════════════════════════╝

# =================================================================
# CONFIGURATION — GITHUB_PAT, GITHUB_ORG and LEVO_AUTH_KEY are required;
# LEVO_ORG_ID is optional.
#
# Two ways to supply each of these four values:
#   1. Replace everything between the quotes with your value, or
#   2. Leave the line untouched and export a variable of the same name
#      before running the script (keeps secrets out of this file).
# A value pasted over the whole quoted text wins over the environment.
# =================================================================

GITHUB_PAT="${GITHUB_PAT:-}"          # <- Paste your GitHub PAT token here
GITHUB_ORG="${GITHUB_ORG:-}"          # <- Your GitHub organization name
LEVO_AUTH_KEY="${LEVO_AUTH_KEY:-}"    # <- Paste your Levo auth key here
LEVO_ORG_ID="${LEVO_ORG_ID:-}"        # <- Your Levo organization ID (optional — leave empty if you only belong to one Levo org)

# -----------------------------------------------------------------
# OPTIONAL — Review these defaults before running
# -----------------------------------------------------------------

LEVO_BASE_URL="https://api.levo.ai"     # Use a regional API endpoint if needed (for example, India: https://api.india-1.levo.ai)
ENV_NAME="bulk-scan"                    # Environment label shown on the Levo dashboard; set this explicitly if you want a different label
DEFAULT_LANGUAGE="java"                 # Fallback language if auto-detection fails
# SCANNER_IMAGE — pinned to a specific build for reproducibility.
# To upgrade, find the new SHA tag at https://hub.docker.com/r/levoai/code-scanner/tags
SCANNER_IMAGE="levoai/code-scanner:3e0aa82"

# -----------------------------------------------------------------
# PARALLEL SCANNING — 4 containers × 1 CPU × 2 GB = fills 4 CPUs / 8 GB
# Increase if your Docker Desktop has more headroom.
# -----------------------------------------------------------------

MAX_PARALLEL=4
CONTAINER_MEM="2g"
CONTAINER_CPUS="1"

# Languages supported by Atom static analysis
SUPPORTED_LANGUAGES="java python javascript typescript c php ruby csharp"

# =================================================================
# VALIDATION
# =================================================================

# Required settings must be non-empty. Diagnostics go to stderr so they are
# not mixed into captured stdout.
if [ -z "$GITHUB_PAT" ]; then
    echo "ERROR: GITHUB_PAT is empty. Open this script and paste your GitHub PAT token." >&2
    exit 1
fi

if [ -z "$GITHUB_ORG" ]; then
    echo "ERROR: GITHUB_ORG is empty. Open this script and set your GitHub org." >&2
    exit 1
fi

if [ -z "$LEVO_AUTH_KEY" ]; then
    echo "ERROR: LEVO_AUTH_KEY is empty. Open this script and paste your Levo auth key." >&2
    exit 1
fi

# LEVO_ORG_ID is OPTIONAL — only needed if you belong to multiple Levo orgs.
# When empty, the scanner CLI auto-selects your single org from LEVO_AUTH_KEY.

# MAX_PARALLEL must be a whole number >= 1. The scheduler waits while
# "running >= MAX_PARALLEL", so 0 would make it wait forever and a
# non-numeric value would make the comparison itself fail.
case "$MAX_PARALLEL" in
    ''|*[!0-9]*)
        echo "ERROR: MAX_PARALLEL must be a positive whole number (got '$MAX_PARALLEL')." >&2
        exit 1
        ;;
esac
if [ "$MAX_PARALLEL" -lt 1 ]; then
    echo "ERROR: MAX_PARALLEL must be at least 1 (got '$MAX_PARALLEL')." >&2
    exit 1
fi

# External programs this script calls directly. base64 is needed to build
# the git Authorization header for cloning.
for TOOL in git docker curl python3 base64; do
    if ! command -v "$TOOL" &> /dev/null; then
        echo "ERROR: '$TOOL' is not installed. Please install it first." >&2
        exit 1
    fi
done

# macOS ships without GNU timeout; use gtimeout (brew install coreutils) if available
if ! command -v timeout &>/dev/null; then
    if command -v gtimeout &>/dev/null; then
        # Shell function so the rest of the script can keep calling `timeout`.
        timeout() { gtimeout "$@"; }
    else
        {
            echo "ERROR: 'timeout' command not found."
            echo "       On macOS, install GNU coreutils: brew install coreutils"
            echo "       which provides it as 'gtimeout'."
        } >&2
        exit 1
    fi
fi

# `docker info` only succeeds when the client can reach a running daemon.
if ! docker info &> /dev/null; then
    echo "ERROR: Docker is installed but not running. Start Docker Desktop and try again." >&2
    exit 1
fi

# =================================================================
# SETUP
# =================================================================

WORK_DIR="$(pwd)"
REPOS_DIR="$WORK_DIR/repos"
RESULTS_DIR="$WORK_DIR/results"
RESULTS_TMP="$WORK_DIR/.bulk-scan-results.tmp"

mkdir -p "$REPOS_DIR"
mkdir -p "$RESULTS_DIR"
> "$RESULTS_TMP"

export LEVO_BASE_URL="$LEVO_BASE_URL"

SCAN_START_TIME=$(date '+%Y-%m-%d %H:%M:%S')
SCAN_START_EPOCH=$(date '+%s')

echo ""
echo "=========================================="
echo "  LEVO BULK REPO SCANNER"
echo "=========================================="
echo ""
echo "  Organization:  $GITHUB_ORG"
echo "  Levo Target:   $LEVO_BASE_URL"
echo "  Environment:   $ENV_NAME"
echo "  Parallelism:   $MAX_PARALLEL containers × $CONTAINER_CPUS CPU × $CONTAINER_MEM RAM"
echo "  Started:       $SCAN_START_TIME"
echo ""

# =================================================================
# FUNCTIONS
# =================================================================

# Guess a repository's language from marker files anywhere in its checkout.
#
# Arguments: $1 - path to the cloned repository
# Globals:   DEFAULT_LANGUAGE (read) - printed when no rule matches
# Outputs:   exactly one language name on stdout
#
# Rules are tried top to bottom and the first one with a matching file wins,
# so the order below is significant (e.g. typescript is tried before javascript).
detect_language() {
    local repo_dir="$1"
    local rule lang_name name_pat
    local -a name_pats find_expr

    for rule in \
        "java:*.java pom.xml build.gradle" \
        "python:*.py requirements.txt setup.py pyproject.toml" \
        "typescript:tsconfig.json *.ts" \
        "javascript:*.js package.json" \
        "ruby:*.rb Gemfile" \
        "php:*.php composer.json" \
        "csharp:*.csproj *.sln *.cs" \
        "c:*.c *.cpp *.h"
    do
        lang_name="${rule%%:*}"

        # read -a splits on spaces without glob-expanding patterns like *.java
        # against the current directory.
        IFS=' ' read -r -a name_pats <<< "${rule#*:}"

        find_expr=()
        for name_pat in "${name_pats[@]}"; do
            find_expr+=(-o -name "$name_pat")
        done

        # "${find_expr[@]:1}" drops the leading -o. The alternatives are wrapped
        # in \( ... \) so that -print -quit applies to every one of them and
        # find stops at the first match.
        if [ -n "$(find "$repo_dir" \( "${find_expr[@]:1}" \) -print -quit 2>/dev/null)" ]; then
            echo "$lang_name"
            return
        fi
    done

    echo "$DEFAULT_LANGUAGE"
}

# Check whether a language name is one the scanner can handle.
#
# Arguments: $1 - normalized (lower-case) language name
# Globals:   SUPPORTED_LANGUAGES (read) - space-separated list of names
# Returns:   0 if $1 is in the list, 1 otherwise
is_supported_language() {
    # Not named LANG: that is the shell's locale variable, and shadowing it
    # changes the locale seen by the shell and by commands run in this function.
    local candidate="$1"
    local supported

    # Intentionally unquoted: the list is split on whitespace into words.
    for supported in $SUPPORTED_LANGUAGES; do
        [ "$candidate" = "$supported" ] && return 0
    done
    return 1
}

# Map a GitHub "language" value onto the names used in SUPPORTED_LANGUAGES.
#
# Arguments: $1 - language name as reported by the GitHub API (e.g. "C#")
# Outputs:   "csharp" for C#, "c" for C and C++, otherwise $1 lower-cased
normalize_github_lang() {
    # Not named LANG: that is the shell's locale variable, and shadowing it
    # would change the locale in effect while tr runs below.
    local gh_lang="$1"

    case "$gh_lang" in
        "C#")       echo "csharp" ;;
        "C++"|"C")  echo "c" ;;
        # LC_ALL=C keeps the case mapping independent of the user's locale.
        *)          printf '%s\n' "$gh_lang" | LC_ALL=C tr '[:upper:]' '[:lower:]' ;;
    esac
}

# Portable spinlock using mkdir (atomic on all POSIX filesystems; works on
# macOS + Linux + Git Bash for Windows, none of which ship flock by default).
LOCK_DIR="$WORK_DIR/.bulk-scan.lockdir"

# Remove a lock directory left behind by an earlier run that was interrupted
# while holding the lock. Without this, acquire_lock would retry forever.
# Only safe while no worker is running, i.e. at startup.
clear_stale_lock() {
    rmdir "$LOCK_DIR" 2>/dev/null || true
}
clear_stale_lock

# Block until this process owns the lock: mkdir succeeds for exactly one
# caller, everyone else sleeps briefly and retries.
acquire_lock() {
    until mkdir "$LOCK_DIR" 2>/dev/null; do sleep 0.05; done
}

# Give the lock up by removing the directory; errors are silenced.
release_lock() {
    rmdir "$LOCK_DIR" 2>/dev/null
}

# Add a single record line to the shared results file while holding the lock,
# so that concurrent workers never interleave their writes.
#
# Arguments: $1 - the record text (written followed by a newline)
# Globals:   RESULTS_TMP (read) - file the record is appended to
locked_record() {
    local record="$1"

    acquire_lock
    {
        echo "$record"
    } >> "$RESULTS_TMP"
    release_lock
}

# Print a worker's buffered card in one piece, then delete the buffer file.
# The lock is held only while the card is being written to stdout, so cards
# from parallel workers cannot interleave.
#
# Arguments: $1 - path of the file holding the card text
flush_card() {
    local card_file="$1"

    acquire_lock
    cat "$card_file"
    release_lock

    rm -f "$card_file"
}

# =================================================================
# STEP 1: Fetch all repos from GitHub + pre-filter by language
# =================================================================

echo "[Step 1/3] Fetching repository list from GitHub org: $GITHUB_ORG ..."
echo ""

PAGE=1
ALL_REPOS=()    # "<org>/<repo>" for every repository returned by the API
ALL_LANGS=()    # GitHub's primary language for the same index ("null" if none)

while true; do
    # The Authorization header is fed to curl as a config file on stdin
    # (-K -) so the PAT does not appear on curl's command line.
    RESPONSE=$(printf 'header = "Authorization: token %s"\n' "$GITHUB_PAT" \
        | curl -s -K - \
            "https://api.github.com/orgs/$GITHUB_ORG/repos?per_page=100&page=$PAGE&type=all")

    # A successful list response is a JSON array (starts with '[').
    # An error response is a JSON object with a top-level "message" field.
    # Matching the substring '"message"' would false-positive on any repo whose
    # description or topics contain the word "message", so detect via the first char.
    FIRST_CHAR=$(printf '%s' "$RESPONSE" | tr -d '[:space:]' | head -c 1)
    if [ "$FIRST_CHAR" != "[" ]; then
        ERROR_MSG=$(printf '%s\n' "$RESPONSE" | grep -o '"message": "[^"]*"' | head -1)
        [ -z "$ERROR_MSG" ] && ERROR_MSG="unexpected response (not a JSON array)"
        {
            echo "ERROR: GitHub API returned: $ERROR_MSG"
            echo "  Check your GITHUB_PAT token and GITHUB_ORG name."
        } >&2
        exit 1
    fi

    # One "<full_name>:<language>" line per repository. Names and languages
    # both come from the same JSON parse so the two lists cannot disagree.
    # stdin is read as bytes and decoded as UTF-8 so the platform's default
    # encoding cannot make the parse fail.
    if ! PAGE_ROWS=$(printf '%s' "$RESPONSE" | python3 -c '
import json, sys
data = json.loads(sys.stdin.buffer.read().decode("utf-8", "replace"))
for repo in data:
    name = repo.get("full_name") or ""
    if name:
        print(name + ":" + (repo.get("language") or "null"))
'); then
        echo "ERROR: Could not parse the GitHub API response (page $PAGE)." >&2
        exit 1
    fi

    # An empty page means we are past the last page of results.
    [ -z "$PAGE_ROWS" ] && break

    while IFS= read -r ROW; do
        ROW=${ROW%$'\r'}    # python on Windows may emit CRLF line endings
        [ -n "$ROW" ] || continue
        ALL_REPOS+=("${ROW%%:*}")
        ALL_LANGS+=("${ROW#*:}")
    done <<< "$PAGE_ROWS"

    PAGE=$((PAGE + 1))
done

TOTAL_FOUND=${#ALL_REPOS[@]}

if [ "$TOTAL_FOUND" -eq 0 ]; then
    echo "ERROR: No repositories found. Check your PAT token and org name." >&2
    exit 1
fi

echo "  Found $TOTAL_FOUND repositories."

# Pre-filter unsupported languages using the GitHub API's primary-language field.
# Repos with no reported language ("null") are kept and detected after cloning.
SCAN_LIST=()
PREFILTERED_COUNT=0
PREFILTERED_DETAILS=""

for ((REPO_IDX = 0; REPO_IDX < TOTAL_FOUND; REPO_IDX++)); do
    FULL_REPO="${ALL_REPOS[$REPO_IDX]}"
    GH_LANG_RAW="${ALL_LANGS[$REPO_IDX]}"
    SHOULD_SKIP=false

    if [ -n "$GH_LANG_RAW" ] && [ "$GH_LANG_RAW" != "null" ]; then
        GH_LANG_NORM=$(normalize_github_lang "$GH_LANG_RAW")
        if ! is_supported_language "$GH_LANG_NORM"; then
            SHOULD_SKIP=true
        fi
    fi

    if $SHOULD_SKIP; then
        PREFILTERED_COUNT=$((PREFILTERED_COUNT + 1))
        REPO_NAME="${FULL_REPO#*/}"
        PREFILTERED_DETAILS="${PREFILTERED_DETAILS}    - ${REPO_NAME} (${GH_LANG_RAW})"$'\n'
        locked_record "PREFILTERED|$REPO_NAME|$GH_LANG_RAW"
    else
        SCAN_LIST+=("$FULL_REPO")
    fi
done

SCAN_COUNT=${#SCAN_LIST[@]}

if [ "$PREFILTERED_COUNT" -gt 0 ]; then
    echo "  Pre-filtered $PREFILTERED_COUNT repo(s) (unsupported language — not cloned)."
    # List which repos were dropped and why; this is the only place it is shown.
    printf '%s' "$PREFILTERED_DETAILS"
fi
echo "  $SCAN_COUNT repo(s) will be scanned."
echo ""

if [ "$SCAN_COUNT" -eq 0 ]; then
    echo "  Nothing to scan. Exiting."
    rm -f "$RESULTS_TMP"
    exit 0
fi

# =================================================================
# WORKER — scans one repo, prints a single clean card when done
# =================================================================

# Worker: clone one repository, detect its language, run the scanner
# container against it, record the outcome and print one buffered card.
#
# Arguments:
#   $1 - repository in <org>/<repo> form
#   $2 - position of this repo in the scan list (display only)
#   $3 - total number of repos being scanned (display only)
# Globals read: WORK_DIR, REPOS_DIR, RESULTS_DIR, GITHUB_PAT, ENV_NAME,
#   LEVO_AUTH_KEY, LEVO_ORG_ID, LEVO_BASE_URL, SCANNER_IMAGE,
#   CONTAINER_MEM, CONTAINER_CPUS
# Side effects: appends one SKIPPED / UNSUPPORTED / FAILED / SUCCESS record
#   via locked_record, writes $RESULTS_DIR/<repo>/scan-output.txt (plus
#   openapi_spec.json / usages.json when present), and deletes the clone.
scan_one_repo() {
    local FULL_REPO="$1"
    local INDEX="$2"
    local TOTAL="$3"
    local REPO_NAME
    REPO_NAME=$(echo "$FULL_REPO" | cut -d'/' -f2)

    # The clone path below is passed to rm -rf. Refuse to continue if either
    # component is empty, otherwise the path would collapse to a parent directory.
    if [ -z "$REPO_NAME" ] || [ -z "$REPOS_DIR" ]; then
        echo "  [$INDEX/$TOTAL] $FULL_REPO: skipped (cannot derive a clone directory)" >&2
        locked_record "SKIPPED|$FULL_REPO|cannot derive a clone directory"
        return
    fi
    local CLONE_DIR="$REPOS_DIR/$REPO_NAME"

    # On Git Bash for Windows, heavy parallel forking sometimes hands a
    # backgrounded subshell a stale CWD reference. After that, every git
    # and docker call fails with "fatal: Unable to read current working
    # directory: No such file or directory". Re-anchor before doing anything.
    cd "$WORK_DIR" 2>/dev/null || true

    # All card text is buffered in this file and printed once by flush_card.
    local OUT
    OUT=$(mktemp "${TMPDIR:-/tmp}/levo-card.XXXXXX")

    # Card header
    {
        echo ""
        echo "=========================================="
        echo " [$INDEX/$TOTAL] $FULL_REPO"
        echo "=========================================="
    } >> "$OUT"

    # -- Step 1: Clone (with one retry for transient failures) --
    # Pass the PAT via http.extraheader instead of embedding it in the URL,
    # which keeps it out of the remote URL git prints in its error messages.
    # NOTE(review): the header is still given with `-c` on git's command line,
    # so the base64 form remains visible in the process list while git runs.
    # `-c credential.helper=` disables Windows Git Credential Manager so parallel
    # clones don't contend over the credential store.
    rm -rf "$CLONE_DIR"
    local CLONE_ERR
    CLONE_ERR=$(mktemp "${TMPDIR:-/tmp}/levo-clone.XXXXXX")
    local CLONE_OK=false
    local TRY
    local AUTH_B64
    AUTH_B64=$(printf 'x-access-token:%s' "$GITHUB_PAT" | base64 | tr -d '\n')
    local AUTH_HEADER="Authorization: Basic $AUTH_B64"
    for TRY in 1 2; do
        cd "$WORK_DIR" 2>/dev/null || true
        if git -c credential.helper= -c http.extraheader="$AUTH_HEADER" \
            clone --depth 1 --quiet \
            "https://github.com/$FULL_REPO.git" \
            "$CLONE_DIR" 2> "$CLONE_ERR"; then
            CLONE_OK=true
            break
        fi
        # Remove any partial clone before retrying / giving up.
        rm -rf "$CLONE_DIR"
        [ "$TRY" -lt 2 ] && sleep 2
    done

    if ! $CLONE_OK; then
        local ERR_LINE
        local REDACTED='***REDACTED***'
        ERR_LINE=$(grep -v '^$' "$CLONE_ERR" 2>/dev/null | head -1)
        # Defence in depth: scrub the credential from the message before it is
        # printed or recorded. The patterns are quoted so the token is matched
        # literally rather than as a glob, and both the raw PAT and the base64
        # form handed to git are covered. Scrubbing happens BEFORE truncation
        # so a token cut in half at the length limit cannot escape the match.
        ERR_LINE=${ERR_LINE//"$GITHUB_PAT"/$REDACTED}
        [ -n "$AUTH_B64" ] && ERR_LINE=${ERR_LINE//"$AUTH_B64"/$REDACTED}
        ERR_LINE=${ERR_LINE:0:160}
        [ -z "$ERR_LINE" ] && ERR_LINE="repo not accessible (empty, archived, or no PAT access)"
        {
            echo "  [Step 1/3] Cloning ... FAILED (after retry)"
            echo "             git: $ERR_LINE"
            echo ""
            echo "  Result: SKIPPED"
            echo ""
        } >> "$OUT"
        locked_record "SKIPPED|$REPO_NAME|$ERR_LINE"
        rm -f "$CLONE_ERR"
        flush_card "$OUT"
        return
    fi
    rm -f "$CLONE_ERR"
    echo "  [Step 1/3] Cloning ... done" >> "$OUT"

    # -- Step 2: Detect language --
    local LANGUAGE
    LANGUAGE=$(detect_language "$CLONE_DIR")
    echo "  [Step 2/3] Language detected: $LANGUAGE" >> "$OUT"

    if ! is_supported_language "$LANGUAGE"; then
        {
            echo ""
            echo "  Result: SKIPPED — Levo scanner does not support $LANGUAGE"
            echo ""
            echo "  Cleaned up cloned repo."
            echo ""
        } >> "$OUT"
        locked_record "UNSUPPORTED|$REPO_NAME|$LANGUAGE"
        rm -rf "$CLONE_DIR"
        flush_card "$OUT"
        return
    fi

    mkdir -p "$RESULTS_DIR/$REPO_NAME"

    echo "             Scanning and uploading ..." >> "$OUT"

    # Build scanner args — --organization is only added if LEVO_ORG_ID is set
    local SCANNER_ARGS=(
        --app-name "$REPO_NAME"
        --env-name "$ENV_NAME"
        --language "$LANGUAGE"
        --key "$LEVO_AUTH_KEY"
    )
    [ -n "$LEVO_ORG_ID" ] && SCANNER_ARGS+=(--organization "$LEVO_ORG_ID")

    # The scanner CLI writes openapi_spec.json to:
    #   $LEVO_WORK_DIRECTORY/scan-results/<app-name>/openapi_spec.json
    # so with LEVO_WORK_DIRECTORY=/workspace mounted to the clone directory,
    # the spec lands on the host at <clone>/scan-results/$REPO_NAME/openapi_spec.json.
    # Apply MSYS_NO_PATHCONV=1 to docker itself (via env), not to the timeout wrapper,
    # so Git Bash on Windows does not translate the -v and -w paths.
    timeout 30m env MSYS_NO_PATHCONV=1 docker run --rm \
        --memory="$CONTAINER_MEM" \
        --cpus="$CONTAINER_CPUS" \
        -v "$CLONE_DIR:/workspace" \
        -e LEVO_BASE_URL="$LEVO_BASE_URL" \
        -e LEVO_WORK_DIRECTORY=/workspace \
        -w /workspace \
        "$SCANNER_IMAGE" \
        "${SCANNER_ARGS[@]}" \
        > "$RESULTS_DIR/$REPO_NAME/scan-output.txt" 2>&1

    # Must be captured immediately: any later command overwrites $?.
    local EXIT_CODE=$?

    # Best effort: keep whatever artifacts exist; a missing file is not an error.
    cp "$CLONE_DIR/scan-results/$REPO_NAME/openapi_spec.json" \
        "$RESULTS_DIR/$REPO_NAME/" 2>/dev/null
    cp "$CLONE_DIR/usages.json" "$RESULTS_DIR/$REPO_NAME/" 2>/dev/null

    # -- Step 3: Result --
    # Exit status 124 is treated as "the 30-minute timeout expired".
    if [ "$EXIT_CODE" -eq 124 ]; then
        {
            echo ""
            echo "  [Step 3/3] Result: FAILED — scan timed out after 30 minutes (repo too large)"
            echo "             It seems like a big repository. Try to scan it separately by following the docs."
        } >> "$OUT"
        locked_record "FAILED|$REPO_NAME|$LANGUAGE|Timed out after 30 minutes"
    elif [ "$EXIT_CODE" -eq 0 ]; then
        {
            echo ""
            echo "  [Step 3/3] Result: SUCCESS — spec uploaded to Levo"
        } >> "$OUT"
        locked_record "SUCCESS|$REPO_NAME|$LANGUAGE"
    else
        # Use the last line of the scanner output as the failure reason.
        local LAST_LINE
        LAST_LINE=$(tail -1 "$RESULTS_DIR/$REPO_NAME/scan-output.txt" 2>/dev/null | head -c 100)
        [ -z "$LAST_LINE" ] && LAST_LINE="exit code $EXIT_CODE"
        {
            echo ""
            echo "  [Step 3/3] Result: FAILED — $LAST_LINE"
        } >> "$OUT"
        locked_record "FAILED|$REPO_NAME|$LANGUAGE|$LAST_LINE"
    fi

    rm -rf "$CLONE_DIR"
    {
        echo ""
        echo "  Cleaned up cloned repo."
        echo ""
    } >> "$OUT"

    flush_card "$OUT"
}

# =================================================================
# STEP 2: Parallel scan
# =================================================================

echo "[Step 2/3] Scanning $SCAN_COUNT repositories ($MAX_PARALLEL in parallel) ..."

declare -a RUNNING_PIDS=()
INDEX=0

for FULL_REPO in "${SCAN_LIST[@]}"; do
    INDEX=$((INDEX + 1))

    # Throttle: while every slot is taken, rebuild the PID list keeping only
    # workers that still answer `kill -0`, and pause briefly if none has
    # finished. `wait -n` is avoided — it requires bash 4.3+ and macOS
    # ships bash 3.2.
    while [ "${#RUNNING_PIDS[@]}" -ge "$MAX_PARALLEL" ]; do
        NEW_PIDS=()
        for PID in "${RUNNING_PIDS[@]}"; do
            if kill -0 "$PID" 2>/dev/null; then
                NEW_PIDS+=("$PID")
            fi
        done
        RUNNING_PIDS=("${NEW_PIDS[@]}")

        if [ "${#RUNNING_PIDS[@]}" -ge "$MAX_PARALLEL" ]; then
            sleep 0.5
        fi
    done

    # Launch the worker in the background and remember its PID.
    scan_one_repo "$FULL_REPO" "$INDEX" "$SCAN_COUNT" &
    RUNNING_PIDS+=("$!")
done

# Barrier: block until every remaining worker has exited.
wait

# =================================================================
# STEP 3: Summary
# =================================================================

# Count result records by status.
#
# Arguments: $1 - results file with one "STATUS|..." record per line
# Globals written: SUCCESS_COUNT, FAILED_COUNT, SKIPPED_COUNT, UNSUPPORTED_COUNT
# Records with any other status (e.g. PREFILTERED) are ignored here.
# A missing or unreadable file leaves all four counters at 0.
tally_results() {
    local results_file="$1"
    local status rest

    SUCCESS_COUNT=0
    FAILED_COUNT=0
    SKIPPED_COUNT=0
    UNSUPPORTED_COUNT=0

    [ -r "$results_file" ] || return 0

    # Only the first |-separated field matters; the remainder goes to $rest.
    while IFS='|' read -r status rest; do
        case "$status" in
            SUCCESS)     SUCCESS_COUNT=$((SUCCESS_COUNT + 1)) ;;
            FAILED)      FAILED_COUNT=$((FAILED_COUNT + 1)) ;;
            SKIPPED)     SKIPPED_COUNT=$((SKIPPED_COUNT + 1)) ;;
            UNSUPPORTED) UNSUPPORTED_COUNT=$((UNSUPPORTED_COUNT + 1)) ;;
        esac
    done < "$results_file"
}

tally_results "$RESULTS_TMP"

SCAN_END_TIME=$(date '+%Y-%m-%d %H:%M:%S')
SCAN_END_EPOCH=$(date '+%s')
ELAPSED=$((SCAN_END_EPOCH - SCAN_START_EPOCH))
ELAPSED_MIN=$((ELAPSED / 60))
ELAPSED_SEC=$((ELAPSED % 60))

echo ""
echo "[Step 3/3] Summary"
echo ""
echo "=========================================="
echo "          BULK SCAN COMPLETE"
echo "=========================================="
echo ""
echo "  Total repositories found:            $TOTAL_FOUND"
echo "  Pre-filtered (unsupported language): $PREFILTERED_COUNT"
echo "  Scanned successfully:                $SUCCESS_COUNT"
echo "  Scan failed:                         $FAILED_COUNT"
echo "  Unsupported (post-clone detect):     $UNSUPPORTED_COUNT"
echo "  Not accessible:                      $SKIPPED_COUNT"
echo ""
echo "  Started:   $SCAN_START_TIME"
echo "  Finished:  $SCAN_END_TIME"
echo "  Duration:  ${ELAPSED_MIN}m ${ELAPSED_SEC}s"
echo ""
echo "  Per-repo output saved to: $RESULTS_DIR/<repo>/"
echo "  Check your Levo dashboard for uploaded specs."
echo "=========================================="
echo ""

# Cleanup temp files: the results file and the (normally already removed)
# lock directory.
rm -f "$RESULTS_TMP"
rmdir "$LOCK_DIR" 2>/dev/null || true