#!/bin/bash
#
# This script leverages BaseX as an XQuery command-line processor
# by using multiple small disposable BaseX databases and parallelizing the entire processing.
# It essentially runs XQuery in batches on large data sets and produces
# SQL INSERT statements, so the data can be imported into PostgreSQL.
#
# We're packing files for processing, and we're trying to balance them out in sets
# such that two constraints are met:
# - no more than 100 files per set
# - no more than 100*90k bytes per set
#
# Timing:
#
# On system1 the following times were recorded:
# If run with -j4, it does 200 thread pages in 10 seconds, and there are
# about 5 posts on average per thread page. Over 85000 seconds (almost a
# day) it would therefore process ~1.7M posts (in 340k forum thread pages)
# and have them prepared to be imported into PostgreSQL.
# For -j4, the observed peak memory usage was 500MB.
#
# Notes:
#
# 1)
# The following error (found through strace) would manifest itself, mainly
# because of GNU Parallel:
# --- stopped by SIGTTOU ---
# It's also described here:
# https://notmuchmail.org/pipermail/notmuch/2019/028015.html
# It can be circumvented through the use of script
# (script - make typescript of terminal session)
#
# 2) --linebuffer is passed to GNU Parallel so each job's output is written
# to stdout as soon as possible.
#
#
# system1 config:
# - BaseX 9.2.4
# - script (from util-linux 2.31.1)
# - GNU Parallel 20161222
# - Ubuntu 18.04 LTS
#
# system1 hardware:
# - cpu: Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz
# - memory: 16GB DDR3 RAM, 2 x Kingston @ 1333 MT/s
# - disk: WDC WD30EURS-73TLHY0 @ 5400-7200RPM
#
# system2 config:
# - BaseX 9.2.4
# - GNU Parallel 20181222
# - script (from util-linux 2.34)
#
# system2 hardware:
# - cpu: Intel(R) Celeron(R) CPU J1900 @ 1.99GHz (4 cores)
# - memory: 4GB DDR RAM @ 1600MHz
# - disk: HDD ST3000VN007-2E4166 @ 5900 rpm
#
#

BASEX="$HOME/basex-preloaded-forums/bin/basex"

# meta-process holds the set lists produced by fpart; tmp holds the per-shard
# BaseX scripts and the partial SQL output.
mkdir -p meta-process tmp

echo "Partitioning files into different sets ..."
# fpart splits the HTML files into sets of at most 100 files and at most
# 100*90k bytes each (left commented out here).
#fpart -f 100 -s $((100 * 90000)) `pwd`/threads/ -o meta-process/files-shard

proc() {
    s="$1"   # job slot number ({%}); the temporary DB name is derived from it
    f="$2"   # set file listing the HTML files to import
    j="$3"   # job sequence number ({#}); names the partial SQL output
    echo "$s -- $j -- $f"

    # Build script to create a temp db, and import all the html files
    SHARD_IMPORT_SCRIPT=$(pwd)"/tmp/import.script.$s"
    SHARD_PROCESS_SCRIPT=$(pwd)"/tmp/process.script.$s"
    SHARD_SQL_OUT=$(pwd)"/tmp/partial.$j.sql"

    cat << EOF > "$SHARD_IMPORT_SCRIPT"
DROP DB tmp-$s
CREATE DB tmp-$s
SET PARSER html
SET HTMLPARSER method=xml,nons=true,nocdata=true,nodefaults=true,nobogons=true,nocolons=true,ignorable=true
SET CREATEFILTER *.html
EOF

    # Turn each file path in the set into an ADD command for BaseX.
    cat "$f" | perl -pne 's{^}{ADD }g;' >> "$SHARD_IMPORT_SCRIPT"

    # Run the import under script(1) so BaseX gets a pseudo-terminal and is not
    # stopped by SIGTTOU (see note 1); /dev/null discards the typescript so the
    # parallel jobs do not all write to a shared "typescript" file.
    script --return -c "$BASEX < $SHARD_IMPORT_SCRIPT >/dev/null ; echo 'Importing Done'; " /dev/null

    # Build processing script, to pull values and build SQL queries
    echo "for \$x in db:open(\"tmp-$s\")" > "$SHARD_PROCESS_SCRIPT"
    cat << 'EOF' >> "$SHARD_PROCESS_SCRIPT"
let $doc_uri:=$x/fn:base-uri()
let $elems:=$x//table[matches(@id,"post\d+")]
for $elem in $elems
    let $date:=$elem/tr[1]/td[1]/text()
    let $author:=$elem/tr[2]/td[1]/div[1]/a//text()
    let $post_id:=$elem/@id/string()
    let $message_elems:=$elem//*[matches(@id,"post_message_")]
    let $raw_content:=string-join($message_elems/text())
    let $content:=fn:normalize-space(
        fn:replace(
            fn:replace($raw_content,'["'',;]',' '),
            'Quote:Originally Posted',
            ''
        )
    )
    (: single-quoted string literals, since PostgreSQL rejects double-quoted values :)
    return out:format("INSERT INTO thread(date,author,post_id,doc_uri,content) VALUES('%s','%s','%s','%s','%s') ;",$date,$author,$post_id,$doc_uri,$content)
EOF

    echo "Running basex ..."
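    # The extraction run below is wrapped in script(1) for the same reason as
    # the import above: without a pseudo-terminal, BaseX jobs started from GNU
    # Parallel can be stopped by SIGTTOU (see note 1 in the header). BaseX's
    # stdout is redirected into the per-shard SQL file, so the typescript only
    # captures the 'Extracting done' marker.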
    script --return -c "$BASEX $SHARD_PROCESS_SCRIPT > $SHARD_SQL_OUT ; echo 'Extracting done';" /dev/null

    ls -lhS "$SHARD_SQL_OUT"
}

export BASEX="$BASEX"
export -f proc

# {%} = job slot number, {1} = set file read from stdin, {#} = job sequence number.
find meta-process/ -type f | parallel --linebuffer --no-notice -j4 proc {%} {1} "{#}"
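
# Once all jobs have finished, the per-shard SQL files under tmp/ can be
# concatenated and loaded into PostgreSQL. The commands below are only a
# sketch: the "forums" database, the "thread" table definition and the psql
# connection details are assumptions, not something this script sets up.
#
#   psql -d forums -c 'CREATE TABLE IF NOT EXISTS thread(date text, author text, post_id text, doc_uri text, content text);'
#   cat tmp/partial.*.sql | psql -1 -d forums -f -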