#!/bin/bash
#
# This script leverages BaseX as an XQuery command-line processor
# by using multiple small disposable BaseX databases and parallelizing the entire processing.
# It essentially runs XQuery in batches on large data sets and produces
# SQL INSERT statements, so the data can be imported into PostgreSQL.
#
# We're packing files for processing, and we're trying to balance them out in sets
# such that two constraints are met:
# - no more than 100 files per set
# - no more than 100*90k bytes per set
#
# Timing:
#
# On system1 the following times were recorded:
# If run with -j4, it does 200 thread pages in 10 seconds, and there are
# about 5 posts on average per thread page. Over 85000 seconds (almost a
# day) it would therefore process ~1.7M posts (in 340k forum thread pages)
# and have them prepared to be imported into PostgreSQL.
# For -j4, the observed peak memory usage was 500MB.
#
# Notes:
#
# 1)
# The following error (found through strace) would manifest itself, mainly
# because of GNU Parallel:
# --- stopped by SIGTTOU ---
# It's also described here:
# https://notmuchmail.org/pipermail/notmuch/2019/028015.html
# It can be circumvented through the use of script
# (script - make typescript of terminal session)
#
# 2) --linebuffer is passed to GNU Parallel so each job's output is written
# to stdout as soon as possible.
#
#
# system1 config:
# - BaseX 9.2.4
# - script (from util-linux 2.31.1)
# - GNU Parallel 20161222
# - Ubuntu 18.04 LTS
#
# system1 hardware:
# - cpu: Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz
# - memory: 16GB DDR3 RAM, 2 x Kingston @ 1333 MT/s
# - disk: WDC WD30EURS-73TLHY0 @ 5400-7200RPM
#
# system2 config:
# - BaseX 9.2.4
# - GNU Parallel 20181222
# - script (from util-linux 2.34)
#
# system2 hardware:
# - cpu: Intel(R) Celeron(R) CPU J1900 @ 1.99GHz (4 cores)
# - memory: 4GB DDR RAM @ 1600MHz
# - disk: HDD ST3000VN007-2E4166 @ 5900 rpm
#
#

BASEX="$HOME/basex-preloaded-forums/bin/basex"

# meta-process holds the set lists produced by fpart; tmp holds the per-shard
# BaseX scripts and the partial SQL output.
mkdir -p meta-process tmp

echo "Partitioning files into different sets ..."
# fpart splits the HTML files into sets of at most 100 files and at most
# 100*90k bytes each (left commented out here).
#fpart -f 100 -s $((100 * 90000)) `pwd`/threads/ -o meta-process/files-shard

proc() {
    s="$1"   # job slot number ({%}); the temporary DB name is derived from it
    f="$2"   # set file listing the HTML files to import
    j="$3"   # job sequence number ({#}); names the partial SQL output
    echo "$s -- $j -- $f"

    # Build script to create a temp db, and import all the html files
    SHARD_IMPORT_SCRIPT=$(pwd)"/tmp/import.script.$s"
    SHARD_PROCESS_SCRIPT=$(pwd)"/tmp/process.script.$s"
    SHARD_SQL_OUT=$(pwd)"/tmp/partial.$j.sql"

    cat << EOF > "$SHARD_IMPORT_SCRIPT"
DROP DB tmp-$s
CREATE DB tmp-$s
SET PARSER html
SET HTMLPARSER method=xml,nons=true,nocdata=true,nodefaults=true,nobogons=true,nocolons=true,ignorable=true
SET CREATEFILTER *.html
EOF

    # Turn each file path in the set into an ADD command for BaseX.
    cat "$f" | perl -pne 's{^}{ADD }g;' >> "$SHARD_IMPORT_SCRIPT"

    # Run the import under script(1) so BaseX gets a pseudo-terminal and is not
    # stopped by SIGTTOU (see note 1); /dev/null discards the typescript so the
    # parallel jobs do not all write to a shared "typescript" file.
    script --return -c "$BASEX < $SHARD_IMPORT_SCRIPT >/dev/null ; echo 'Importing Done'; " /dev/null

    # Build processing script, to pull values and build SQL queries
    echo "for \$x in db:open(\"tmp-$s\")" > "$SHARD_PROCESS_SCRIPT"
    cat << 'EOF' >> "$SHARD_PROCESS_SCRIPT"
let $doc_uri:=$x/fn:base-uri()
let $elems:=$x//table[matches(@id,"post\d+")]
for $elem in $elems
    let $date:=$elem/tr[1]/td[1]/text()
    let $author:=$elem/tr[2]/td[1]/div[1]/a//text()
    let $post_id:=$elem/@id/string()
    let $message_elems:=$elem//*[matches(@id,"post_message_")]
    let $raw_content:=string-join($message_elems/text())
    let $content:=fn:normalize-space(
        fn:replace(
            fn:replace($raw_content,'["'',;]',' '),
            'Quote:Originally Posted',
            ''
        )
    )
    (: single-quoted string literals, since PostgreSQL rejects double-quoted values :)
    return out:format("INSERT INTO thread(date,author,post_id,doc_uri,content) VALUES('%s','%s','%s','%s','%s') ;",$date,$author,$post_id,$doc_uri,$content)
EOF

    echo "Running basex ..."
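    # The extraction run below is wrapped in script(1) for the same reason as
    # the import above: without a pseudo-terminal, BaseX jobs started from GNU
    # Parallel can be stopped by SIGTTOU (see note 1 in the header). BaseX's
    # stdout is redirected into the per-shard SQL file, so the typescript only
    # captures the 'Extracting done' marker.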
    script --return -c "$BASEX $SHARD_PROCESS_SCRIPT > $SHARD_SQL_OUT ; echo 'Extracting done';" /dev/null

    ls -lhS "$SHARD_SQL_OUT"
}

export BASEX="$BASEX"
export -f proc

# {%} = job slot number, {1} = set file read from stdin, {#} = job sequence number.
find meta-process/ -type f | parallel --linebuffer --no-notice -j4 proc {%} {1} "{#}"
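
# Once all jobs have finished, the per-shard SQL files under tmp/ can be
# concatenated and loaded into PostgreSQL. The commands below are only a
# sketch: the "forums" database, the "thread" table definition and the psql
# connection details are assumptions, not something this script sets up.
#
#   psql -d forums -c 'CREATE TABLE IF NOT EXISTS thread(date text, author text, post_id text, doc_uri text, content text);'
#   cat tmp/partial.*.sql | psql -1 -d forums -f -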