This commit is contained in:
parent 2c670b99c7
commit 05b6d9a40c
a.num (Normal file)
@@ -0,0 +1,3 @@
n 64 100 1024 4096
k1 0.5 0.3 0.42 0.88
k2 0.8 0.2 0.76 0.62
b.num (Normal file)
@@ -0,0 +1,3 @@
n 64 100 1024 4096
k1 0,5 0,3 0,42 0,88
k2 0,8 0,2 0,76 0,62
combinator.inc (Normal file)
@@ -0,0 +1,68 @@
# Just . combinator.inc or source combinator.inc
# After that:
echo "You can (manually):"
echo "- header data.csv"
echo "- fill data.csv magyar rand"
echo "Or create a comparison for a specific data kind:"
echo "- genfor rand data.csv"
echo "Or just a big default mess:"
echo "- generate data.csv"
echo "To clean up data for LibreOffice Calc (the Hungarian one, that is):"
echo "- cleanup data.csv"
echo ""
echo "The generate gives a 'default set'; you can add your missing stuff with further 'fill' commands if needed"

basefile=5000000.txt

declare -a definputs=("worst" "smallrange" "rand" "constant")
declare -a sortalgs=(`awk '/worst/{getline; last=1} last{for(x=1;x<=NF;++x) print $x}' ORS=' ' $basefile`)

# header data.csv
header() {
    outfile="$1"

    echo -n 'alg ' > "$outfile"
    awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' ORS='\t' `ls -tr --time=birth *.txt` | sed 's/,$//' >> "$outfile"
    echo "" >> "$outfile"
}

# fill data.csv magyar rand
fill() {
    outfile="$1"
    alg="$2"
    input="$3"

    col=$(cat $basefile | grep copy | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<=NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +')

    echo -n "$alg-$input " >> "$outfile"
    awk "/$input/{print \$($col+1)}" ORS='\t' `ls -tr --time=birth *.txt` >> "$outfile"
    echo "" >> "$outfile"
}

# genfor "rand" data.csv
genfor() {
    inp="$1"
    outfile="$2"
    header "$outfile"
    for alg in "${sortalgs[@]}"; do
        echo -n "Adding $alg-"; echo "$inp"
        fill "$outfile" "$alg" "$inp"
    done
}

# generate data.csv
generate() {
    outfile="$1"
    header "$outfile"

    # Rem.: fill directly per input and algorithm; calling genfor here would rerun header and truncate the file on every input
    for inp in "${definputs[@]}"; do
        for alg in "${sortalgs[@]}"; do
            echo "Adding $alg-$inp"
            fill "$outfile" "$alg" "$inp"
        done
    done
}

# cleanup data.csv prepared.csv
cleanup() {
    in="$1"
    out="$2"
    # turn dot decimals into comma decimals (the regex also eats any trailing 's' right after them)
    sed "s/\([0-9][0-9]*\)\.\([0-9][0-9]*\)s*/\1,\2/g" "$in" > "$out"
}
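
# Rem.: a usage sketch (my addition; it just follows the help text above and expects
# the *.txt measurement files to be in the current directory):
#   . combinator.inc
#   generate data.csv
#   cleanup data.csv prepared.csv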
num1.awk (Normal file)
@@ -0,0 +1 @@
function hn(x) { gsub(",", ".", x); return x+0 } function ihn(x) { gsub("\\.", ",", x); return x } BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ihn(hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)}
num2.awk (Normal file)
@@ -0,0 +1,19 @@
function hn(x) {
    gsub(",", ".", x);
    return x+0
}

function ihn(x) {
    gsub("\\.", ",", x);
    return x
}

BEGIN {
    getline;
    for(i=1; i<=NF; ++i) saved[i]=$i;
    print $0
}

{
    for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ihn(hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)
}
parancs.awk (Normal file)
@@ -0,0 +1 @@
{print $2}
pelda.asd (Normal file)
@@ -0,0 +1,8 @@
Errors, at:
1 2 3 4
4pasu copy frewr gptbuck
Warnings:
14:22 turned off machine
21:38 file is written with bad group
23:22 turned off machine
23:42 file is written with bad group
steps.sh (Normal file)
@@ -0,0 +1,189 @@
#!/bin/bash

# We have a bunch of these files
cat 5000000.txt
ls *.txt

# And we want graph-able output data like this:
# alg\n          10      100    1000..
# magyar-rand    0,005   0,03   0,3...
# magyar-worst   0,007   0,06   0,4...
# 4pasu-rand     0,017   0,11   0,7...
# 4pasu-worst    0,0237  0,42   1,3...

# One can start out thinking grep + sed as usual, but it quickly becomes tedious
# At least we can grab the number of elements from the file itself (so the filename being wrong does not matter)
cat 5000000.txt | grep Sorting | sed "s/.* \([0-9]*\) .*/\1/"

# But enter AWK!
awk '{print $1}' 5000000.txt
awk '{if(n =="") n = $2} END{print n}' 5000000.txt
awk 'BEGIN{getline; print $2}' 5000000.txt

# More interesting stuff
awk '{if(NR % 2 == 0) elemek[NR/2] = $2} END{for(i = 1; i <= NR/2; ++i) print elemek[i]}' 5000000.txt
awk '{getline; print $2}' 5000000.txt

# And to be fancy (also showing that ls can sort by creation time nowadays; check without it to see the bad ordering)
awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' `ls -t --time=birth *.txt`
awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' `ls -tr --time=birth *.txt`
awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' `ls -t *.txt`

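# Rem.: BEGINFILE/ENDFILE are gawk extensions. A rough portable sketch of the same idea (my addition,
# assuming a POSIX awk) keys off FNR==1, which is true on the first line of each input file:
awk 'FNR==1{if(NR>1) print n; n=""} n==""{n=$2} END{print n}' `ls -tr --time=birth *.txt`
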
######################################
# Column to row translation with AWK #
######################################

# Getting the second field of every line with awk is really simple (OFS is the output separator, FS the input separator)
awk '{print $2}' 5000000.txt

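# Rem.: a tiny made-up FS/OFS sketch (my addition, not part of the measurement workflow):
# read colon-separated input, write it tab-separated
echo 'a:b:c' | awk -v FS=':' -v OFS='\t' '{print $1, $3}'
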
# This is closer to what we want, because the last row's value sits in a different column ($1), but it still prints junk
awk '{print $2; last=$1} END {print last}' 5000000.txt

# This way we never print while processing the lines, only manually in END with a loop - that is what we want
awk '{out[NR]=$2; first=$1} END {out[NR]=first; for(i=1;i<=NR;i++) print out[i]}' 5000000.txt

# This leaves out the first few lines but is otherwise the same
awk '{out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) print out[i]}' 5000000.txt

# This way we write the output tab-separated
awk '{out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) print out[i]}' ORS='\t' 5000000.txt
# Alternative: you can change ORS (the output record separator) on the fly. The same goes for OFS, FS and RS!
awk '{out[NR]=$2; first=$1} END {ORS="\t"; out[NR]=first; for(i=3;i<=NR;i++) print out[i]}' 5000000.txt

# I wanted to omit empty lines of the input
awk '{out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) if(out[i] != "") print out[i]}' ORS='\t' 5000000.txt

# A way to store the first line's second column (n) in the last position, next to the alg name
awk '{if(n == "") n = $2; out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) if(out[i] != "") print out[i]; print n}' ORS='\t' 5000000.txt

# NOT what I want (but worth showing)
awk '{if(n == "") n = $2; out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) if(out[i] != "") print out[i] n}' ORS='\t' 5000000.txt
awk '{if(n == "") n = $2; out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<NR;i++) if(out[i] != "") print out[i]; print out[i] n}' ORS='\t' 5000000.txt

# The same in a more bash-friendly form - just so that I can use bash variables inside the AWK program from now on
awk "{if(n == \"\") n = \$2; out[NR]=\$2; first=\$1} END {out[NR]=first; for(i=3;i<=NR;i++) if(out[i] != \"\") print out[i]; print n}" ORS="\t" 5000000.txt

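# Rem.: an alternative sketch (my addition) that avoids most of the escaping: pass the bash
# variable in with awk's -v option and keep the program single-quoted
start=3
awk -v start="$start" '{if(n == "") n = $2; out[NR]=$2; first=$1} END {out[NR]=first; for(i=start;i<=NR;i++) if(out[i] != "") print out[i]; print n}' ORS='\t' 5000000.txt
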
######################################
# Getting the index of the algorithm #
######################################

# This gets the line listing the algorithms
cat 5000000.txt | grep copy | sed 's/^\s*//'

# Get the column (column index) of the given algorithm
# RS is used here instead of FS, because I want the individual values as records so I can count them;
# BEWARE the ' +': RS is not the special default (usually the line end), and it does not behave like FS,
# where a single ' ' means 'any run of whitespace' and '[ ]' means a single space
alg='magyar'
cat 5000000.txt | grep copy | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +'

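# Rem.: a tiny standalone sketch of the RS=' +' record splitting on made-up input (my addition);
# gawk treats a multi-character RS as a regex, so runs of spaces separate the records
echo 'magyar   4pasu copy' | awk '{print NR": "$1}' RS=' +'
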
# Get the value into a bash variable (backticks fail here, so use $(..) instead)
col=$(cat 5000000.txt | grep copy | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +')
echo "$col"

#########################################################################
# Getting row-column based value from file + add extracted 'n' above it #
#########################################################################

# So we already have $col for alg=magyar; how do we get the value for input="rand", for example?
# This is really simple and I wanted to show it

alg='magyar'
input='rand'

col=$(cat 5000000.txt | grep copy | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +')

input='rand'
awk "{if(\$1 == \"$input\") print \$($col+1)}" 5000000.txt
cat 5000000.txt # check

# Or actually even simpler if you regex-search your way to the position with AWK
awk "/rand/{print \$($col+1)}" 5000000.txt
awk "/$input/{print \$($col+1)}" 5000000.txt

# So after a search we can actually write out a CSV-like list of values too!
# This one is maybe not working as you would expect, and you can start thinking about getlines, loops, double searches, etc.
# The reason: a block without any pattern prefix runs for every line of the file...
awk "/asc/{print \$($col+1)} {print \$($col+1)}" ORS='\t' 5000000.txt
# But this is the way - because variables can act as "flags" for the blocks!
# This writes out the target column from where ascdesc was found (inclusive) until descdesc (exclusive)
awk "/ascdesc/{flag=1} /descdesc/{flag=0} flag{print \$($col+1)}" ORS=',' 5000000.txt

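# Rem.: a tiny standalone sketch of the flag trick on made-up lines (my addition, not the real measurement files):
printf 'a\nascdesc\nb\nc\ndescdesc\nd\n' | awk '/ascdesc/{flag=1} /descdesc/{flag=0} flag{print}'
# prints: ascdesc, b, c - i.e. from the start marker (inclusive) up to the end marker (exclusive)
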
# Yeah... But didn't we want data like this?
#
# alg\n          10      100    1000..
# magyar-rand    0,005   0,03   0,3...
# magyar-worst   0,007   0,06   0,4...
# 4pasu-rand     0,017   0,11   0,7...
# 4pasu-worst    0,0237  0,42   1,3...

# Let's put it together

header() {
    outfile=$1

    echo -n 'alg ' > $outfile
    awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' ORS='\t' `ls -tr --time=birth *.txt` | sed 's/,$//' >> $outfile
}
header data.csv

# Just look at this awesomeness... we do not even need to leave AWK to work out which column index it is!!!
awk "/worst/{getline; last=1} last{for(i=1;i<=NF;++i) if (\$i == \"$alg\") col=i} END{print col}" 5000000.txt

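# Rem.: a sketch (my addition) chaining that with the earlier idea: capture the index, then use it right away
col=$(awk "/worst/{getline; last=1} last{for(i=1;i<=NF;++i) if (\$i == \"$alg\") col=i} END{print col}" 5000000.txt)
awk "/rand/{print \$($col+1)}" 5000000.txt
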
# Let's really put this together too, for filling the file

basefile=5000000.txt
# fill data.csv magyar rand
fill() {
    outfile=$1
    alg=$2
    input=$3

    col=$(cat $basefile | grep copy | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<=NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +')

    echo -n "$alg-$input " >> $outfile
    awk "/$input/{print \$($col+1)}" ORS='\t' `ls -tr --time=birth *.txt` >> $outfile
    echo "" >> $outfile
}

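# Rem.: a usage sketch (my addition), matching the help text at the top of combinator.inc:
header data.csv
fill data.csv magyar rand
fill data.csv magyar worst
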
# See: combinator.inc

######################
# Dividing data by n #
######################

# Look at this random other file
awk '{for(i=1; i<=NF; ++i) print $i}' a.num

# We can quite simply save every column's data per line if we want - so let's save the first line's data
# Rem.: the getline in BEGIN removes that line from the later main block! Yes...
awk 'BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i} {for(i=1; i<=NF; ++i) print saved[i]}' a.num

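# Rem.: a tiny made-up sketch of that getline behaviour (my addition, not the real data files):
printf '1\n2\n3\n' | awk 'BEGIN{getline; print "BEGIN saw " $0} {print "main saw " $0}'
# prints: BEGIN saw 1, main saw 2, main saw 3 - the first line was consumed in BEGIN
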
# Instead of printing the real data (except for i==1), print it divided by n (the first line/row, same column position)
awk 'BEGIN{getline; for(i=1; i<=NF; ++i){saved[i]=$i; print $i}} {for(i=1; i<=NF; ++i) if(i == 1) print $i; else print ($i/saved[i])}' a.num

# Try tab-separated (but well... turns out this becomes a single line now)
awk 'BEGIN{getline; for(i=1; i<=NF; ++i){saved[i]=$i; print $i}} {for(i=1; i<=NF; ++i) if(i == 1) print $i; else print ($i/saved[i])}' ORS='\t' a.num

# Use printf and handle ORS/OFS manually
# Nearly...
awk 'BEGIN{getline; for(i=1; i<=NF; ++i){saved[i]=$i; print $i}} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ($i/saved[i]), (i==NF ? ORS : OFS)}' OFS='\t' a.num
# Working...
awk 'BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ($i/saved[i]), (i==NF ? ORS : OFS)}' OFS='\t' a.num

# But what about the 'Hungarian' floating point numbers, with a comma instead of a dot?
awk 'BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ($i/saved[i]), (i==NF ? ORS : OFS)}' OFS='\t' b.num

# Solvable (enter AWK helper functions)
awk 'function hn(x) { gsub(",", ".", x); return x+0 } BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", (hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)}' OFS='\t' b.num

# But the above prints with a dot again - so convert back
# WARNING (inside a string, "\." is just ".", which as a regex matches any character):
awk 'function hn(x) { gsub(",", ".", x); return x+0 } function ihn(x) { gsub("\.", ",", x); return x } BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ihn(hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)}' OFS='\t' b.num
# GOOD ("\\." becomes the regex \. and matches only the literal dot):
awk 'function hn(x) { gsub(",", ".", x); return x+0 } function ihn(x) { gsub("\\.", ",", x); return x } BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ihn(hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)}' OFS='\t' b.num

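# Rem.: a tiny standalone sketch of hn()/ihn() on one made-up value (my addition):
echo '0,5' | awk 'function hn(x) { gsub(",", ".", x); return x+0 } function ihn(x) { gsub("\\.", ",", x); return x } {print ihn(hn($1)/2)}'
# prints 0,25
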
# But at this complexity you often put things into an external .awk file
awk -f num1.awk OFS='\t' b.num

# And at that point you can organize it better
awk -f num2.awk OFS='\t' b.num