From 05b6d9a40cd00985765feee99bcede496f3cebca Mon Sep 17 00:00:00 2001
From: Richard Thier <magosit@outlook.hu>
Date: Sun, 12 Oct 2025 12:07:08 +0200
Subject: [PATCH] main content

---
 a.num          |   3 +
 b.num          |   3 +
 combinator.inc |  68 ++++++++++++++++++
 num1.awk       |   1 +
 num2.awk       |  19 +++++
 parancs.awk    |   1 +
 pelda.asd      |   8 +++
 steps.sh       | 189 +++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 292 insertions(+)
 create mode 100644 a.num
 create mode 100644 b.num
 create mode 100644 combinator.inc
 create mode 100644 num1.awk
 create mode 100644 num2.awk
 create mode 100644 parancs.awk
 create mode 100644 pelda.asd
 create mode 100644 steps.sh

diff --git a/a.num b/a.num
new file mode 100644
index 0000000..d7dea05
--- /dev/null
+++ b/a.num
@@ -0,0 +1,3 @@
+n	64	100	1024	4096
+k1	0.5	0.3	0.42	0.88
+k2	0.8	0.2	0.76	0.62
diff --git a/b.num b/b.num
new file mode 100644
index 0000000..b27d3ae
--- /dev/null
+++ b/b.num
@@ -0,0 +1,3 @@
+n	64	100	1024	4096
+k1	0,5	0,3	0,42	0,88
+k2	0,8	0,2	0,76	0,62
diff --git a/combinator.inc b/combinator.inc
new file mode 100644
index 0000000..10124ae
--- /dev/null
+++ b/combinator.inc
@@ -0,0 +1,68 @@
+# Just . combinator.inc or source combinator.inc
+# After that,
+echo "You can (manually):"
+echo "- header data.csv"
+echo "- fill data.csv magyar rand"
+echo "Or create comparison for a specific data kind:"
+echo "- genfor rand data.csv"
+echo "Or just a big default mess:"
+echo "- generate data.csv"
+echo "To cleanup data for libreoffice calc (hungarian one that is):"
+echo "- cleanup data.csv prepared.csv" # cleanup redirects into its 2nd argument, so both files are required
+echo ""
+echo "The generate gives a 'default set' that you can add your missing stuff with further 'fill' commands if needed"
+
+basefile='5000000.txt' # result file that the algorithm names and column positions are read from
+# Input kinds tabulated by 'generate'; sortalgs = every field from the line after a /worst/ match onwards
+declare -a definputs=('worst' 'smallrange' 'rand' 'constant')
+declare -a sortalgs=($(awk '/worst/{getline; last=1} last{for(x=1;x<=NF;++x) print $x}' ORS=' ' "$basefile"))
+
+# header data.csv
+header() { # header OUTFILE: (re)create OUTFILE so that it holds only the header row
+	outfile="$1"
+	# Label cell first ('>' truncates old content), then per *.txt file (ls -tr --time=birth order) its first non-empty 2nd field
+	printf 'alg\t' > "$outfile"
+	awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' ORS='\t' $(ls -tr --time=birth *.txt) | sed 's/,$//' >> "$outfile"
+	printf '\n' >> "$outfile"
+}
+
+# fill data.csv magyar rand
+fill() { # fill OUTFILE ALG INPUT: append one "ALG-INPUT<TAB>value<TAB>..." row to OUTFILE; returns 1 if ALG is unknown
+	local outfile="$1" alg="$2" input="$3" col
+	# 1-based position of ALG among the names on the 'copy' line of $basefile (names separated by runs of spaces)
+	col=$(grep copy "$basefile" | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<=NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +')
+	# With an empty col awk would evaluate $(+1) and silently write the first field instead of a measurement
+	[ -n "$col" ] || { echo "fill: algorithm '$alg' not found in $basefile" >&2; return 1; }
+
+	echo -n "$alg-$input	" >> "$outfile"
+	awk "/$input/{print \$($col+1)}" ORS='\t' `ls -tr --time=birth *.txt` >> "$outfile"
+	echo "" >> "$outfile"
+}
+
+# genfor "rand" data.csv
+genfor() { # genfor INPUT OUTFILE: fresh header in OUTFILE, then one row per algorithm in sortalgs for INPUT
+	inp="$1"; outfile="$2"
+	header "$outfile"
+	for alg in "${sortalgs[@]}"
+	do
+		printf 'Adding %s-' "$alg"; echo "$inp"
+		fill "$outfile" "$alg" "$inp"
+	done
+}
+
+# generate data.csv
+generate() { # generate OUTFILE: one header row, then one row per (input kind, algorithm) pair
+	outfile="$1"
+	header "$outfile"
+	# Call fill directly: genfor re-runs header, which truncates OUTFILE and would keep only the last input kind
+	for inp in "${definputs[@]}"; do
+		for alg in "${sortalgs[@]}"; do echo "Adding $alg-$inp"; fill "$outfile" "$alg" "$inp"; done
+	done
+}
+
+# cleanup data.csv prepared.csv
+cleanup() { # cleanup IN OUT: copy IN to OUT with decimal numbers rewritten
+	in="$1"; out="$2"
+	# digits.digits followed by any run of 's' becomes digits,digits (decimal comma, 's' run dropped)
+	sed -e 's/\([0-9][0-9]*\)\.\([0-9][0-9]*\)s*/\1,\2/g' "$in" > "$out"
+}
diff --git a/num1.awk b/num1.awk
new file mode 100644
index 0000000..c650361
--- /dev/null
+++ b/num1.awk
@@ -0,0 +1 @@
+function hn(x) { gsub(",", ".", x); return x+0 } function ihn(x) { gsub("\\.", ",", x); return x } BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ihn(hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)} # hn: comma decimal -> number, ihn: dot -> comma; the first record is printed as-is and saved, later records keep field 1 and print every other field divided by the saved value of the same column
diff --git a/num2.awk b/num2.awk
new file mode 100644
index 0000000..b9ca270
--- /dev/null
+++ b/num2.awk
@@ -0,0 +1,19 @@
+function hn(text) {	# hn: number written with a decimal comma -> awk number
+	gsub(",", ".", text)
+	return text + 0
+}
+# ihn: the way back, every dot in the value becomes a comma
+function ihn(text) {
+	gsub("\\.", ",", text)
+	return text
+}
+# The first record is echoed unchanged and its fields are kept as per-column divisors
+BEGIN {
+	getline
+	for (col = 1; col <= NF; ++col) saved[col] = $col
+	print $0
+}
+# Later records: field 1 verbatim, every other field divided by the divisor of its column
+{
+	for (col = 1; col <= NF; ++col) printf "%s%s", (col == 1 ? $col : ihn(hn($col) / hn(saved[col]))), ((col == 1 || col < NF) ? OFS : ORS)
+}
diff --git a/parancs.awk b/parancs.awk
new file mode 100644
index 0000000..2d182d1
--- /dev/null
+++ b/parancs.awk
@@ -0,0 +1 @@
+{print $2}
diff --git a/pelda.asd b/pelda.asd
new file mode 100644
index 0000000..c54fa78
--- /dev/null
+++ b/pelda.asd
@@ -0,0 +1,8 @@
+Errors, at:
+1          2        3       4
+4pasu      copy     frewr   gptbuck
+Warnings:
+14:22 turned off machine
+21:38 file is written with bad group
+23:22 turned off machine
+23:42 file is written with bad group
diff --git a/steps.sh b/steps.sh
new file mode 100644
index 0000000..bc6b239
--- /dev/null
+++ b/steps.sh
@@ -0,0 +1,189 @@
+#!/bin/bash
+
+# We have a bunch of these files
+cat 5000000.txt 
+ls *.txt
+
+# And we want graph-able output data like this:
+# alg\n          10      100    1000..
+# magyar-rand    0,005   0,03   0,3...
+# magyar-worst   0,007   0,06   0,4...
+# 4pasu-rand     0,017   0,11   0,7...
+# 4pasu-worst    0,0237  0,42   1,3...
+
+# One can get started thinking grep + sed as usual, but it can become tedious
+# At least we can grab the number of elements from the file itself (ensures filename is not bad)
+cat 5000000.txt | grep Sorting | sed "s/.* \([0-9]*\) .*/\1/"
+
+# But enter AWK!
+awk '{print $1}' 5000000.txt
+awk '{if(n =="") n = $2} END{print n}' 5000000.txt
+awk 'BEGIN{getline; print $2}' 5000000.txt
+
+# More interesting stuff
+awk '{if(NR % 2 == 0) elemek[NR/2] = $2} END{for(i = 1; i <= NR/2; ++i) print elemek[i]}' 5000000.txt
+awk '{getline; print $2}' 5000000.txt
+
+# And to be fancy (also showing ls can use creation time nowadays, check without to see bad stuff)
+awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' `ls -t --time=birth *.txt`
+awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' `ls -tr --time=birth *.txt`
+awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' `ls -t *.txt`
+
+######################################
+# Column to row translation with AWK #
+######################################
+
+# Getting the second field of every line with awk is really simple. OFS is the output field separator, FS the input one
+awk '{print $2}' 5000000.txt
+
+# This is closer to what we want, because the last row starts elsewhere, but duplicates junk
+awk '{print $2; last=$1} END {print last}' 5000000.txt
+
+# This way we never print while processing line until END, just manually afterwards with loop - what we want
+awk '{out[NR]=$2; first=$1} END {out[NR]=first; for(i=1;i<=NR;i++) print out[i]}' 5000000.txt
+
+# This leaves out the first few lines but is the same
+awk '{out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) print out[i]}' 5000000.txt
+
+# This way, we write output as tab-separated (ORS is set through a command-line assignment)
+awk '{out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) print out[i]}' ORS='\t' 5000000.txt
+# Alternative: can change ORS (output record separator) on the fly. You can do with OFS and FS too and RS too!
+awk '{out[NR]=$2; first=$1} END {ORS="\t"; out[NR]=first; for(i=3;i<=NR;i++) print out[i]}' 5000000.txt
+
+# I wanted to omit empty lines of the input
+awk '{out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) if(out[i] != "") print out[i]}' ORS='\t' 5000000.txt
+
+# A way to store the first lines second column (n) into the last position near alg name
+awk '{if(n == "") n = $2; out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) if(out[i] != "") print out[i]; print n}' ORS='\t' 5000000.txt
+
+# NOT what I want (but want to show)
+awk '{if(n == "") n = $2; out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<=NR;i++) if(out[i] != "") print out[i] n}' ORS='\t' 5000000.txt
+awk '{if(n == "") n = $2; out[NR]=$2; first=$1} END {out[NR]=first; for(i=3;i<NR;i++) if(out[i] != "") print out[i]; print out[i] n}' ORS='\t' 5000000.txt
+
+# More bash-friendly this will be - just so that I can use bash's variables in AWK from now on
+awk "{if(n == \"\") n = \$2; out[NR]=\$2; first=\$1} END {out[NR]=first; for(i=3;i<=NR;i++) if(out[i] != \"\") print out[i]; print n}" ORS="\t" 5000000.txt
+
+######################################
+# Getting the index of the algorithm #
+######################################
+
+# This gets the line of the algorithms
+cat 5000000.txt | grep copy | sed 's/^\s*//'
+
+# Get the column (column index) of the given algorithm
+# RS is used here instead of FS, because I want tabbed values as records to count them;
+# BEWARE: it has to be ' +' (a regex): in RS a single ' ' is just one literal space, unlike in FS where ' ' means 'any run of whitespace' (and '[ ]' a single space)
+alg='magyar'
+cat 5000000.txt | grep copy | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<=NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +'
+
+# Get value in bash variable (backtick fails, so do $(..) instead); the loop runs to i<=NR so the last name is checked too
+col=$(cat 5000000.txt | grep copy | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<=NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +')
+echo "$col"
+
+#########################################################################
+# Getting row-column based value from file + add extracted 'n' above it #
+#########################################################################
+
+# So we already have $col, for alg=magyar, how to get value for input="rand" for example?
+# This is really simple and wanted to show this
+
+alg='magyar'
+input='rand'
+
+col=$(cat 5000000.txt | grep copy | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<=NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +')
+
+# The first field of a data row is compared with the input kind; the wanted value is read from field $col+1
+awk "{if(\$1 == \"$input\") print \$($col+1)}" 5000000.txt
+cat 5000000.txt # check
+
+# Or actually even simpler if you regex-search to position with AWK
+awk "/rand/{print \$($col+1)}" 5000000.txt
+awk "/$input/{print \$($col+1)}" 5000000.txt
+
+# So we can write out a CSV list of values too after a search actually!
+# This is maybe not working as you expect, and you can start thinking about getlines, loops, double searches, etc
+# Because a block without any prefix runs again for the whole file...
+awk "/asc/{print \$($col+1)} {print \$($col+1)}" ORS='\t' 5000000.txt
+# But this is the way - because variables can be "flags" for the blocks!
+# This writes out every target column from when ascdesc was found (including) and until descdesc (not including)
+awk "/ascdesc/{flag=1} /descdesc/{flag=0} flag{print \$($col+1)}" ORS=',' 5000000.txt
+
+# Yeah... But didn't we want data like this?
+#
+# alg\n          10      100    1000..
+# magyar-rand    0,005   0,03   0,3...
+# magyar-worst   0,007   0,06   0,4...
+# 4pasu-rand     0,017   0,11   0,7...
+# 4pasu-worst    0,0237  0,42   1,3...
+
+# Lets put it together
+
+header() { # header OUTFILE: (re)create OUTFILE with just the header row: 'alg', then one n per *.txt file
+	outfile=$1
+	echo -n 'alg	' > "$outfile"
+	awk 'BEGINFILE{n=""} {if(n =="") n = $2} ENDFILE{print n}' ORS='\t' `ls -tr --time=birth *.txt` | sed 's/,$//' >> "$outfile"
+	echo "" >> "$outfile" # end the header row, otherwise the first row appended by fill is glued onto it
+}
+header data.csv
+
+# Just look at how awesome this is... we do not even need to leave AWK to find the column index!!!
+awk "/worst/{getline; last=1} last{for(i=1;i<=NF;++i) if (\$i == \"$alg\") col=i} END{print col}" 5000000.txt
+
+# Lets put together really this too for how to fill files
+
+basefile=5000000.txt
+# fill data.csv magyar rand
+fill() { # fill OUTFILE ALG INPUT: append one "ALG-INPUT<TAB>value<TAB>..." row to OUTFILE; returns 1 if ALG is unknown
+	local outfile="$1" alg="$2" input="$3" col
+	# 1-based position of ALG among the names on the 'copy' line of $basefile (names separated by runs of spaces)
+	col=$(grep copy "$basefile" | sed 's/^\s*//' | awk "{out[NR] = \$1} END {for(i=1;i<=NR;i++) if(out[i] == \"$alg\") print i;}" RS=' +')
+	# With an empty col awk would evaluate $(+1) and silently write the first field instead of a measurement
+	[ -n "$col" ] || { echo "fill: algorithm '$alg' not found in $basefile" >&2; return 1; }
+
+	echo -n "$alg-$input	" >> "$outfile"
+	awk "/$input/{print \$($col+1)}" ORS='\t' `ls -tr --time=birth *.txt` >> "$outfile"
+	echo "" >> "$outfile"
+}
+
+# See: combinator.inc
+
+######################
+# Dividing data by n #
+######################
+
+# Look at this random other file
+awk '{for(i=1; i<=NF; ++i) print $i}' a.num
+
+# We can save every columns data / line if we want just simply - so save first line's data
+# Rem.: The getline in begin removes that line from the later block! Yes...
+awk 'BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i} {for(i=1; i<=NF; ++i) print saved[i]}' a.num
+
+# Instead of printing real data (except for i==1) make it be divided by n (the first line / row and same col position)
+awk 'BEGIN{getline; for(i=1; i<=NF; ++i){saved[i]=$i; print $i}} {for(i=1; i<=NF; ++i) if(i == 1) print $i; else print ($i/saved[i])}' a.num
+
+# Try tab-separated (but well... turns out this becomes a single line now)
+awk 'BEGIN{getline; for(i=1; i<=NF; ++i){saved[i]=$i; print $i}} {for(i=1; i<=NF; ++i) if(i == 1) print $i; else print ($i/saved[i])}' ORS='\t' a.num
+
+# Use printf and manual ORS/OFS
+# Nearly...
+awk 'BEGIN{getline; for(i=1; i<=NF; ++i){saved[i]=$i; print $i}} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ($i/saved[i]), (i==NF ? ORS : OFS)}' OFS='\t' a.num
+# Working...
+awk 'BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ($i/saved[i]), (i==NF ? ORS : OFS)}' OFS='\t' a.num
+
+# But what about the 'Hungarian' floating point numbers with comma instead of dot?
+awk 'BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ($i/saved[i]), (i==NF ? ORS : OFS)}' OFS='\t' b.num
+
+# Solvable (enter AWK helper functions)
+awk 'function hn(x) { gsub(",", ".", x); return x+0 } BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", (hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)}' OFS='\t' b.num
+
+# But the above uses dot again - so convert back
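+# WARNING (shown on purpose): in the string "\." the backslash does not survive (gawk warns and treats it as plain "."), so the regex matches ANY character: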
+awk 'function hn(x) { gsub(",", ".", x); return x+0 } function ihn(x) { gsub("\.", ",", x); return x } BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ihn(hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)}' OFS='\t' b.num
+# GOOD:
+awk 'function hn(x) { gsub(",", ".", x); return x+0 } function ihn(x) { gsub("\\.", ",", x); return x } BEGIN{getline; for(i=1; i<=NF; ++i) saved[i]=$i; print $0} {for(i=1; i<=NF; ++i) if(i == 1) printf "%s%s", $i, OFS; else printf "%s%s", ihn(hn($i)/hn(saved[i])), (i==NF ? ORS : OFS)}' OFS='\t' b.num
+
+# But at this complexity often you put things into external .awk file
+awk -f num1.awk OFS='\t' b.num
+
+# Can better organize at that point
+awk -f num2.awk OFS='\t' b.num