LAL: Linear Arrangement Library 24.10.00
A library focused on algorithms on linear arrangements of graphs.
Loading...
Searching...
No Matches
check_correctness.hpp
1/*********************************************************************
2 *
3 * Linear Arrangement Library - A library that implements a collection
4 * algorithms for linear arrangments of graphs.
5 *
6 * Copyright (C) 2019 - 2024
7 *
8 * This file is part of Linear Arrangement Library. The full code is available
9 * at:
10 * https://github.com/LAL-project/linear-arrangement-library.git
11 *
12 * Linear Arrangement Library is free software: you can redistribute it
13 * and/or modify it under the terms of the GNU Affero General Public License
14 * as published by the Free Software Foundation, either version 3 of the
15 * License, or (at your option) any later version.
16 *
17 * Linear Arrangement Library is distributed in the hope that it will be
18 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Affero General Public License for more details.
21 *
22 * You should have received a copy of the GNU Affero General Public License
23 * along with Linear Arrangement Library. If not, see <http://www.gnu.org/licenses/>.
24 *
25 * Contact:
26 *
27 * LluĂ­s Alemany Puig (lluis.alemany.puig@upc.edu)
28 * LQMC (Quantitative, Mathematical, and Computational Linguisitcs)
29 * CQL (Complexity and Quantitative Linguistics Lab)
30 * Jordi Girona St 1-3, Campus Nord UPC, 08034 Barcelona. CATALONIA, SPAIN
31 * Webpage: https://cqllab.upc.edu/people/lalemany/
32 *
33 * Ramon Ferrer i Cancho (rferrericancho@cs.upc.edu)
34 * LQMC (Quantitative, Mathematical, and Computational Linguisitcs)
35 * CQL (Complexity and Quantitative Linguistics Lab)
36 * Office 220, Omega building
37 * Jordi Girona St 1-3, Campus Nord UPC, 08034 Barcelona. CATALONIA, SPAIN
38 * Webpage: https://cqllab.upc.edu/people/rferrericancho/
39 *
40 ********************************************************************/
41
42// C includes
43#include <omp.h>
44
45// C++ includes
46#if defined DEBUG
47#include <cassert>
48#endif
49#include <filesystem>
50#include <charconv>
51#include <fstream>
52#include <sstream>
53#include <string>
54
55// lal includes
56#include <lal/basic_types.hpp>
57#include <lal/io/head_vector_error.hpp>
58#include <lal/io/treebank_file_error.hpp>
59#include <lal/io/treebank_file_report.hpp>
60#include <lal/io/treebank_collection_report.hpp>
61#include <lal/graphs/directed_graph.hpp>
62#include <lal/detail/graphs/cycles.hpp>
63#include <lal/detail/graphs/conversions.hpp>
64
65namespace lal {
66namespace detail {
67
68#define file_does_not_exist(F) \
69"Error: Treebank '" + F + "' does not exist."
70
71#define file_could_not_be_opened(F) \
72"Error: Treebank '" + F + "' could not be opened."
73
74#define invalid_integer(i, chunk) \
75"Error: Value at position '" + std::to_string(i) + "' (value: '" + chunk + "') \
76is not a valid non-negative integer number."
77
78#define head_out_of_bounds(i) \
79"Error: Head index at position '" + std::to_string(i) + "' (value: \
80" + std::to_string(hv[i]) + ") is out of bounds."
81
82#define wrong_num_roots(r) \
83"Error: Wrong number of roots: " + std::to_string(n_roots) + "."
84
85#define wrong_num_edges(n, m) \
86"Error: Wrong number of edges. Number of vertices is '" + std::to_string(n) + \
87 "'. Number of edges is '" + std::to_string(m) + "'; " + \
88 "should be '" + std::to_string(n-1) + "'."
89
90#define graph_has_cycles_msg \
91"Error: The graph described is not a tree, i.e., it has cycles."
92
93#define isolated_vertex(u) \
94"Error: Vertex '" + std::to_string(u) + "' is isolated."
95
96#define self_loop(pos) \
97"Error: found a self-loop at position '" + std::to_string(pos) + "'."
98
107template <bool decide>
108[[nodiscard]] std::conditional_t<decide, bool, std::vector<io::head_vector_error>>
110noexcept
111{
112 std::vector<io::head_vector_error> error_list;
113
114 // number of nodes of the graph
115 const uint64_t n = hv.size();
116
117 uint64_t n_roots = 0;
118 bool can_make_graph = true;
119
120 // inspect the head vector
121 for (std::size_t i = 0; i < hv.size(); ++i) {
122 if (hv[i] == 0) {
123 ++n_roots;
124 continue;
125 }
126
127 // ensure that the number is within bounds
128 if (hv[i] > hv.size()) {
129 if constexpr (decide) { return true; }
130 else {
131 error_list.emplace_back(
132 head_out_of_bounds(i),
134 );
135 can_make_graph = false;
136 }
137 }
138 // self loop
139 else if (hv[i] == i + 1) {
140 if constexpr (decide) { return true; }
141 else {
142 error_list.emplace_back(
143 self_loop(i + 1),
145 );
146 can_make_graph = false;
147 }
148 }
149 }
150
151 // check there is exactly one root
152 if (n_roots != 1) {
153 if constexpr (decide) { return true; }
154 else {
155 error_list.emplace_back(
156 wrong_num_roots(n_roots),
158 );
159 }
160 }
161
162 if (can_make_graph) {
163 // ignore singleton graphs
164 if (n == 1) {
165 if constexpr (decide) { return false; }
166 else { return error_list; }
167 }
168
169 // make a directed graph with the values
170 const auto dgraph = from_head_vector_to_graph<graphs::directed_graph>(hv,false,false);
171 {
172 const bool has_cycles = has_undirected_cycles(dgraph);
173 if (has_cycles) {
174 if constexpr (decide) { return true; }
175 else {
176 error_list.emplace_back(
177 graph_has_cycles_msg,
179 );
180 }
181 }
182 }
183
184 // find isolated vertices
185 for (node u = 0; u < dgraph.get_num_nodes(); ++u) {
186 if (dgraph.get_degree(u) == 0) {
187 if constexpr (decide) { return true; }
188 else {
189 error_list.emplace_back(
190 isolated_vertex(u),
192 );
193 }
194 }
195 }
196
197 // check the number of edges is correct
198 if (dgraph.get_num_edges() != dgraph.get_num_nodes() - 1) {
199 if constexpr (decide) { return true; }
200 else {
201 error_list.emplace_back(
202 wrong_num_edges(dgraph.get_num_nodes(), dgraph.get_num_edges()),
204 );
205 }
206 }
207 }
208
209 if constexpr (decide) { return false; }
210 else { return error_list;}
211}
212
219template <bool decide>
220[[nodiscard]] std::conditional_t<decide, bool, std::vector<io::head_vector_error>>
221find_errors(const std::string& current_line)
222noexcept
223{
224 std::vector<io::head_vector_error> error_list;
225
226 bool non_numeric_characters = false;
227 head_vector hv;
228
229 // ensure there are only numeric characters
230 {
231 std::size_t i = 1;
232 std::stringstream ss(current_line);
233 std::string chunk;
234 while (ss >> chunk) {
235
236 uint64_t value;
237 const auto result = std::from_chars
238 (&chunk[0], (&chunk[chunk.size() - 1]) + 1, value);
239
240 if (result.ec == std::errc::invalid_argument) {
241 if constexpr (decide) { return true; }
242 else {
243 error_list.emplace_back(
244 invalid_integer(i, chunk),
246 );
247 non_numeric_characters = true;
248 }
249 }
250 else {
251 hv.push_back(value);
252 }
253
254 ++i;
255 }
256 }
257
258 // if the current line contains non-numeric characters then the appropriate
259 // error messages have been stored, and so we can skip to the next line
260 if (non_numeric_characters) {
261 if constexpr (decide) { return true; }
262 else { return error_list; }
263 }
264
265 // if the program reaches this point, the vector 'treebank_err_list' is empty.
266#if defined DEBUG
267 assert(error_list.size() == 0);
268#endif
269 return find_errors<decide>(hv);
270}
271
278template <bool decide>
279[[nodiscard]] std::conditional_t<decide, bool, io::treebank_file_report>
280check_correctness_treebank(const std::string& treebank_filename)
281noexcept
282{
283 if (not std::filesystem::exists(treebank_filename)) {
284 if constexpr (decide) { return true; }
285 else {
287 file_does_not_exist(treebank_filename),
289 });
290 }
291 }
292
293 std::ifstream fin(treebank_filename);
294 if (not fin.is_open()) {
295 if constexpr (decide) { return true; }
296 else {
298 file_could_not_be_opened(treebank_filename),
300 });
301 }
302 }
303
306
307 std::string current_line;
308
309 std::size_t line = 1;
310 while (getline(fin, current_line)) {
311 if (current_line == "") {
312 // do nothing
313 }
314 else {
315 auto r = find_errors<decide>(current_line);
316 if constexpr (decide) {
317 // if there are errors, return
318 if (r) { return true; }
319 }
320 else {
321 // append errors to the list
322 for (io::head_vector_error& err : r) {
323 report.add_error(line, std::move(err));
324 }
325 }
326 }
327
328 ++line;
329 }
330
331 if constexpr (decide) { return false; }
332 else { return report; }
333}
334
342template <bool decide>
343[[nodiscard]] std::conditional_t<decide, bool, io::treebank_collection_report>
345(const std::string& main_file_name, const std::size_t n_threads)
346noexcept
347{
348 if (not std::filesystem::exists(main_file_name)) {
349 if constexpr (decide) { return true; }
350 else {
352 file_does_not_exist(main_file_name),
354 });
355 }
356 }
357 std::ifstream fin_main_file(main_file_name);
358 if (not fin_main_file.is_open()) {
359 if constexpr (decide) { return true; }
360 else {
362 file_could_not_be_opened(main_file_name),
364 });
365 }
366 }
367
370 char errors_found = 0;
371
372 #pragma omp parallel num_threads(n_threads) shared(errors_found)
373 {
374
375 const int tid = omp_get_thread_num();
376 if (tid == 0) {
377
378 std::size_t main_file_line = 1;
379 std::string id, treebankname;
380
381 while (fin_main_file >> id >> treebankname and errors_found == 0) {
382 // make full path to the treebank
383 std::filesystem::path treebank_full_path(main_file_name);
384 treebank_full_path.replace_filename(treebankname);
385 const std::string full_path_as_string = treebank_full_path.string();
386
387 #pragma omp task
388 {
389
390 if (errors_found == 0) {
391
392 // check correctess of treebank
393 auto r = check_correctness_treebank<decide>(full_path_as_string);
394
395 if constexpr (decide) {
396 if (r) {
397 #pragma omp critical
398 errors_found = 1;
399 }
400 }
401 else {
402 // append errors found in the treebank to
403 // the list of errors of this dataset
404 if (r.get_num_errors() > 0) {
405 #pragma omp critical
406 {
407 report.add_report(
408 main_file_line,
409 std::move(treebankname),
410 std::move(id),
411 std::move(r)
412 );
413 }
414 }
415 }
416 }
417 }
418 ++main_file_line;
419 }
420
421 }
422 }
423
424 if constexpr (decide) {
425 return (errors_found == 0 ? false : true);
426 }
427 else { return report; }
428}
429
430} // -- namespace detail
431} // -- namespace lal
Head vector error report class.
Definition head_vector_error.hpp:64
Report on a treebank collection.
Definition treebank_collection_report.hpp:67
void set_treebank_error(const treebank_file_error &err) noexcept
Sets the error concerning the main file of the collection.
Definition treebank_collection_report.hpp:136
void add_report(const uint64_t line_number, const std::string &treebank_file_name, const std::string &treebank_id, const treebank_file_report &err) noexcept
Adds a report on a treebank file.
Definition treebank_collection_report.hpp:103
Report on a treebank file.
Definition treebank_file_report.hpp:69
void add_error(const uint64_t line_number, const head_vector_error &err) noexcept
Adds an error to the list of errors.
Definition treebank_file_report.hpp:125
void set_treebank_error(const treebank_file_error &err) noexcept
Sets the treebank error m_treebank_error.
Definition treebank_file_report.hpp:138
std::conditional_t< decide, bool, io::treebank_collection_report > check_correctness_treebank_collection(const std::string &main_file_name, const std::size_t n_threads) noexcept
Find errors in a treebank collection.
Definition check_correctness.hpp:345
std::conditional_t< decide, bool, std::vector< io::head_vector_error > > find_errors(const head_vector &hv) noexcept
Find errors in a head vector.
Definition check_correctness.hpp:109
graph_t from_head_vector_to_graph(const head_vector &hv, const bool normalize, const bool check) noexcept
Transforms a head vector in a directed graph.
Definition conversions.hpp:191
bool has_undirected_cycles(const graph_t &g, BFS< graph_t > &bfs) noexcept
Returns true if, and only if, the graph has UNDIRECTED cycles.
Definition cycles.hpp:138
std::conditional_t< decide, bool, io::treebank_file_report > check_correctness_treebank(const std::string &treebank_filename) noexcept
Find errors in a treebank file.
Definition check_correctness.hpp:280
@ self_loop
The current head index points to itself.
@ wrong_number_of_edges
The graph does not contain enough edges to be a tree.
@ head_out_bounds
The current head index is a valid non-negative integer value, but points outside the head vector.
@ isolated_vertex
There are isolated vertices in the graph.
@ graph_has_cycles
The graph contains an undirected cycle, that is, the graph is not a tree.
@ invalid_integer
The current head index is not a valid non-integer integer number. It could be a letter,...
@ wrong_number_of_roots
The head vector contains too many roots.
@ main_file_could_not_be_opened
Main file could not be opened.
@ treebank_file_does_not_exist
A treebank was not found in disk.
@ treebank_result_file_could_not_be_opened
The resulting file of processing a treebank could not be opened.
@ main_file_does_not_exist
Main file does not exist.
Main namespace of the library.
Definition basic_types.hpp:48
std::vector< uint64_t > head_vector
See Head vector page for further details.
Definition basic_types.hpp:58
uint64_t node
Node type. See Node / Vertex page for further details.
Definition basic_types.hpp:51