Browse Source

cursed WTT extractor

master
eta 2 months ago
parent
commit
a0695322ba
  1. 165
      wtt.lisp

165
wtt.lisp

@ -0,0 +1,165 @@
(defpackage :wtt
(:use :cl))
(in-package :wtt)
(defparameter *wtt-district* "/home/eta/Downloads/wtt-151-dis.pdf")
(defun run-pdftotext (timetable page x y width height)
"Runs pdftotext to extract data from PAGE of TIMETABLE, with a bounding box starting at (X, Y) with dimensions (WIDTH, HEIGHT)."
(split-sequence:split-sequence
#\Newline
(string-trim
'(#\Space #\Return #\Newline #\Tab)
(uiop:run-program
`("pdftotext" "-r" "300"
"-f" ,(princ-to-string page)
"-l" ,(princ-to-string page)
"-x" ,(princ-to-string x)
"-y" ,(princ-to-string y)
"-W" ,(princ-to-string width)
"-H" ,(princ-to-string height)
"-nopgbrk"
,timetable
"-")
:error-output *error-output*
:output :string))))
(defun run-pdftotext-with-bbox (timetable page bbox)
(apply #'run-pdftotext (append (list timetable page) bbox)))
(defparameter *header-left-bbox* '(148 190 854 97))
(defparameter *header-right-bbox* '(1475 187 854 97))
(defparameter *train-data-top-y* 300)
(defparameter *train-data-bottom-y* 1820)
(defparameter *train-data-x* 548)
(defparameter *train-data-width* 90)
(defparameter *train-data-per-row* 20)
(defparameter *train-no-delta* 63)
(defparameter *train-no-delta* 63)
(defparameter *deltas* nil)
(defun read-one-train (timetable page n &optional bottom)
(let ((x (+ *train-data-x*
(* n *train-data-width*)))
(y (if bottom
*train-data-bottom-y*
*train-data-top-y*)))
(loop
for (keyword . y-delta) in *deltas*
append (list
(cons keyword
(run-pdftotext timetable page x y *train-data-width* y-delta)))
do (incf y y-delta))))
(defparameter *deltas-eastbound*
'((:train-no . 63)
(:trip-no . 49)
(:notes . 97)
(:eby-pfm . 35)
(:eby . 25)
(:eac . 25)
(:eac-depot . 25)
(:acton-town . 35)
(:richmond . 35)
(:turnham-green . 25)
(:hammersmith . 25)
(:west-kensington . 38)
(:olympia . 49)
(:wimbledon . 35)
(:east-putney . 25)
(:putney-bridge . 25)
(:parsons-green . 25)
(:earls-court . 25)
(:earls-court-pfm . 35)
(:hsk . 35)
(:hsk-pfm . 25)
(:edgware-road . 25)
(:edgware-road-pfm . 35)
(:hsk-inner-rail . 35)
(:gloucester-road . 25)
(:south-kensington . 25)
(:embankment . 25)
(:mansion-house . 25)
(:tower-hill . 25)
(:aldgate . 35)
(:liverpool-street . 35)
(:aldgate-east . 25)
(:whitechapel . 25)
(:west-ham . 35)
(:plaistow . 35)
(:barking . 25)
(:barking-sidings . 25)
(:dagenham-east . 25)
(:upminster . 25)
(:upminster-pfm . 25)
(:upminster-depot . 60)
(:to-form . 60)))
(defparameter *deltas-westbound*
'((:train-no . 63)
(:trip-no . 49)
(:notes . 97)
(:upminster-depot . 35)
(:upminster-pfm . 25)
(:upminster . 25)
(:dagenham-east . 25)
(:barking-sidings . 25)
(:barking . 25)
(:plaistow . 35)
(:west-ham . 35)
(:whitechapel . 25)
(:aldgate-east . 25)
(:liverpool-street . 35)
(:aldgate . 35)
(:tower-hill . 25)
(:mansion-house . 25)
(:embankment . 25)
(:south-kensington . 25)
(:gloucester-road-pfm-1 . 25)
(:gloucester-road-pfm-2 . 25)
(:hsk-outer-rail . 35)
(:edgware-road-pfm . 35)
(:edgware-road . 25)
(:hsk-pfm . 25)
(:hsk . 35)
(:earls-court-pfm . 35)
(:earls-court . 25)
(:parsons-green . 25)
(:putney-bridge . 25)
(:east-putney . 25)
(:wimbledon . 35)
(:olympia . 49)
(:west-kensington . 38)
(:hammersmith . 25)
(:turnham-green . 25)
(:richmond . 35)
(:acton-town . 35)
(:eac-depot . 25)
(:eac . 25)
(:eby . 25)
(:eby-pfm . 60)
(:to-form . 45)))
(defun read-one-page (timetable page)
(let* ((header-left (run-pdftotext-with-bbox timetable page *header-left-bbox*))
(header-right (run-pdftotext-with-bbox timetable page *header-right-bbox*))
(*deltas* (if (search "EASTBOUND" (first header-left))
*deltas-eastbound*
*deltas-westbound*)))
(append
(list page header-left header-right)
(loop
for col from 0 below *train-data-per-row*
append (list (read-one-train timetable page col)))
(loop
for col from 0 below *train-data-per-row*
append (list (read-one-train timetable page col t))))))
(defun main ()
(loop
for page from 19 upto 160
do (format t "~&Processing page ~A~%" page)
(cpk:encode-to-file
(read-one-page "./wtt-district.pdf" page)
(format nil "./wtt-district-~A.cpk" page))))
Loading…
Cancel
Save