Daniel Lemire wrote an article about generating scripts in C++ using an LLM: "Producing useful commands on the go using C++ and AI".

The generated code looks like this:

// assume clang++ 18 or g++ 13
// compile with: c++ -std=c++23 -o csv csv.cpp
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <print>
#include <ranges>
#include <sstream>
#include <string>
#include <unordered_set>

/// Counts the distinct values found in each column of a comma-separated file.
///
/// The first line of the file supplies the column labels. Every following
/// line is split on ',' (quoting and escaping are not interpreted) and each
/// cell is added to the set that belongs to its column. One summary line per
/// label that received at least one cell is printed to stdout, ordered by
/// label; columns sharing the same label are counted together.
///
/// @param argc  Argument count; must be exactly 2.
/// @param argv  argv[1] is the path of the CSV file to read.
/// @return EXIT_SUCCESS on success; EXIT_FAILURE on wrong usage, when the
///         file cannot be opened, or when it has no first line.
int main(int argc, char *argv[]) {
  // Diagnostics go to stderr so that stdout carries only the report.
  if (argc != 2) {
    std::println(std::cerr, "Usage: {} <csv_file>", argv[0]);
    return EXIT_FAILURE;
  }
  std::ifstream file(argv[1]);
  if (!file.is_open()) {
    // println already terminates the line; no explicit '\n' needed.
    std::println(std::cerr, "Error: Cannot open file {}", argv[1]);
    return EXIT_FAILURE;
  }

  // std::getline leaves the '\r' of a CRLF line ending in the buffer; drop it
  // so it does not become part of the last label or cell of the line.
  const auto strip_cr = [](std::string &text) {
    if (!text.empty() && text.back() == '\r') {
      text.pop_back();
    }
  };

  // Read first line for column labels
  std::string line;
  if (!std::getline(file, line)) {
    std::println(std::cerr, "Error: Empty file {}", argv[1]);
    return EXIT_FAILURE;
  }
  strip_cr(line);

  // Parse first line into labels, keyed by zero-based column index
  std::map<std::size_t, std::string> labels;
  auto header_cells = line | std::views::split(',');
  for (auto cell : header_cells) {
    labels.emplace(labels.size(), std::string(cell.begin(), cell.end()));
  }

  // label -> set of distinct cell values seen in that column
  std::map<std::string, std::unordered_set<std::string>> columns;
  while (std::getline(file, line)) {
    strip_cr(line);
    auto cells = line | std::views::split(',');
    for (auto [idx, cell] : std::views::enumerate(cells)) {
      // Look the label up without inserting: operator[] would silently
      // create an empty label for cells that have no header.
      const auto label = labels.find(static_cast<std::size_t>(idx));
      if (label == labels.end()) {
        break; // row is wider than the header: ignore the surplus cells
      }
      columns[label->second].insert(std::string(cell.begin(), cell.end()));
    }
  }

  // Print results using labels
  for (const auto &[label, values] : columns) {
    std::println("Column {}: {} distinct values", label, values.size());
  }
  return EXIT_SUCCESS;
}

I usually use C to write small scripts, so I decided to rewrite the code above in C using sheepy:

#! /usr/bin/env sheepy
#include "libsheepyObject.h"

// Shorthands that fix getG's second argument to rtSmallJsont / rtChar.
// NOTE(review): presumably that argument selects the type getG hands back
// (getJ results are stored in smallJson pointers below, getC results are used
// as strings) — confirm against libsheepyObject.h.
#define getJ(obj, key) getG(obj, rtSmallJsont, key)
#define getC(obj, key) getG(obj, rtChar, key)

// Reports the number of distinct values in each column of a CSV file.
//
// argv[1] names the file. Element 0 of the loaded file is split on ',' to
// obtain the column labels; every remaining element is split the same way and
// each cell is pushed onto its column's array unless hasG reports that the
// array already holds it. One "Column <label>: <n> distinct values" line is
// then logged per label.
//
// Returns EXIT_FAILURE when the argument count is not 2, when readTextG
// fails, or when element 0 is missing; EXIT_SUCCESS otherwise.
int main(int argc, char** argv) {

    if (argc != 2) {
        logE("Usage: %s <csv file>", argv[0]);
        return EXIT_FAILURE;
    }
    // NOTE(review): presumably readTextG loads the file as an array with one
    // element per line (element 0 is treated as the header below) — confirm.
    cleanAllocateSmallJson(file);
    if (not readTextG(file, argv[1])) {
        logE("Cannot open file %s", argv[1]);
        return EXIT_FAILURE;
    }
    // Read first line for column labels
    cleanFinishSmallJsonP(line) = getJ(file, 0);
    if (not line) {
        logE("Empty file %s",argv[1]);
        return EXIT_FAILURE;
    }

    // Parse first line into labels
    cleanSmallJsonP(labels) = splitG(line, ',');
    // NOTE(review): presumably removes the header (elements [0, 1)) so the
    // iteration below only sees data rows — confirm delG's range semantics.
    delG(file, 0, 1);
    // columns gets one initially empty array per label, in label order
    cleanAllocateSmallJson(columns);
    range(i, lenG(labels)) {
        createSmallArray(a);
        pushG(columns, &a);
    }
    // Walk the remaining lines and collect unseen cell values per column
    iter(file, L) {
        castS(l, L);
        cleanSmallArrayP(cells) = splitG(l, ',');
        // Only the first lenG(labels) cells of a row are looked at.
        range(i, lenG(labels)) {
            cleanFinishSmallJsonP(col) = getJ(columns, i);
            // NOTE(review): for a row with fewer cells than labels it is not
            // visible here what getC yields for a missing index — confirm
            // that hasG/pushG tolerate that value.
            var c = getC(cells, i);
            // The linear hasG check is what keeps each column duplicate-free
            if (not hasG(col, c)) {
                pushG(col, c);
                // NOTE(review): presumably stores the updated col back at
                // index i of columns — confirm setPG's ownership rules.
                setPG(columns, i, col);
            }
        }
    }
    // Print results using labels
    iter(labels, lb) {
        // NOTE(review): iterIndexG(labels) is presumably the index of the
        // label currently being iterated, matching the column order — confirm.
        cleanFinishSmallJsonP(col) = getJ(columns, iterIndexG(labels));
        // NOTE(review): if lenG returns size_t, %d should be %zu — confirm.
        logI("Column %s: %d distinct values", ssGet(lb), lenG(col));
    }
    return EXIT_SUCCESS;
}
// vim: set expandtab ts=2 sw=2:

I wonder whether an LLM would be able to write this code, since OpenAI and Google have crawled all the code in my libsheepy.

  • The clean* macros use the GNU C cleanup attribute (__attribute__((cleanup))) to call free when the variable goes out of scope.
  • smallJsont is a type which can be a bool, integer, string, array or dictionary.
  • getG is a macro which uses C11 _Generic to call a function depending on the parameter types.
  • iter is a macro which calls the iterator interface of the object given as its first parameter.

Sheepy (gemini) sheepy (http)

Hashtag: #programming