OpenZGY/C++ API and Internals (ALPHA)
Access seismic data stored in ZGY format.
histogrambuilder.h
Go to the documentation of this file.
1 // Copyright 2017-2020, Schlumberger
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 //Adapted from: Zgy/UtilityLib/HistogramBuilder
16 #pragma once
17 
18 #include "../declspec.h"
19 #include "statisticdata.h" // needed due to data members.
20 #include "histogramdata.h" // needed due to data members.
21 
22 #include <cmath> // needed for std::isfinite
23 
24 #include <limits>
25 #include <iterator> // needed for std::iterator_traits
26 #include <assert.h>
27 #ifdef LINUX
28 #include <stdint.h> // for intptr_t
29 #endif
30 
31 namespace InternalZGY {
32 #if 0
33 }
34 #endif
35 
51 class OPENZGY_TEST_API HistogramBuilder
52 {
53 public:
54 
55  typedef int size_type;
56  typedef StatisticData::count_type count_type;
57 
58  HistogramBuilder(size_type _nbins, double _min, double _max);
59  HistogramBuilder(const count_type* bins, int nbins, double min, double max, count_type scnt, double ssum, double sssq, double smin, double smax);
60 
61  HistogramBuilder& operator+=(const HistogramBuilder& other);
62  HistogramBuilder& operator-=(const HistogramBuilder& other);
63  HistogramBuilder& operator*=(count_type factor);
64  bool operator==(const HistogramBuilder& other) const;
65  bool operator!=(const HistogramBuilder& other) const;
66 
67  // Queries
68  const StatisticData& getstats() const { return stats_; }
69  const HistogramData& gethisto() const { return histo_; }
70  StatisticData gethiststats() const;
72  // Data collection
73  template <typename It> void add(It begin, It end);
74 
75  // Other operations
76  void scale(double oldmin, double oldmax, double newmin, double newmax);
77 
78 private:
79  void clear();
80  template <typename It> void tryAdd(It begin, It end, StatisticData *localstats);
81  template <typename It> void fastAddSignedChar256(It begin, It end);
82 
83 private:
84  StatisticData stats_;
85  HistogramData histo_;
86 };
87 
92 template <typename It>
93 void HistogramBuilder::fastAddSignedChar256(It begin, It end)
94 {
95  assert(this->gethisto().getsize() == 256 &&
96  std::numeric_limits<typename std::iterator_traits<It>::value_type>::is_integer &&
97  std::numeric_limits<typename std::iterator_traits<It>::value_type>::min()+128 == 0 &&
98  std::numeric_limits<typename std::iterator_traits<It>::value_type>::max() == +127 &&
99  std::numeric_limits<typename std::iterator_traits<It>::value_type>::min() == this->gethisto().getmin() &&
100  std::numeric_limits<typename std::iterator_traits<It>::value_type>::max() == this->gethisto().getmax());
101 
102  // For char input, there is no risk of overflow and no scaling - just an offset.
103  for (It i = begin; i != end; ++i)
104  histo_.addToBinNumberUNSAFE(static_cast<char>(*i) + 128);
105 
106  // stats_ needs to be re-calculated. Assuming that the typical call to add()
107  // contains 256k samples, it is more efficient to do it this way.
108  // For char data (unlike wider types), we will get a precise result.
109  // It would have been even faster to defer this until the stats were
110  // actually requested, but this way is safer and hopefully fast enough.
111  // Note: if I ever rewrite the calling code to parallelize the histogram
112  // generation, the chunks will likely be smaller and in that case something
113  // will need to be done to avoid calculating stats too often.
114  stats_ = StatisticData(gethisto().getbins(), gethisto().getsize(), gethisto().getmin(), gethisto().getmax(), true);
115 }
116 
122 template <typename It>
123 void HistogramBuilder::add(It begin, It end)
124 {
125  // verify that we need to do anything
126  if (begin == end) {
127  return;
128  }
129 
130  // Short cuts for simple cases that can be handled more efficiently.
131  if (std::numeric_limits<typename std::iterator_traits<It>::value_type>::is_integer &&
132  std::numeric_limits<typename std::iterator_traits<It>::value_type>::min() == this->gethisto().getmin() &&
133  std::numeric_limits<typename std::iterator_traits<It>::value_type>::max() == this->gethisto().getmax())
134  {
135  if (this->gethisto().getsize() == 256 &&
136  std::numeric_limits<typename std::iterator_traits<It>::value_type>::min()+128 == 0 &&
137  std::numeric_limits<typename std::iterator_traits<It>::value_type>::max() == +127)
138  {
139  // This can be dramatically (10x) faster, since we don't need to increment statistics for each sample.
140  fastAddSignedChar256(begin, end);
141  return;
142  }
143  }
144 
145  // Calculate statistics and histogram information in a single pass.
146  StatisticData tempstats;
147  tryAdd<It>(begin, end, &tempstats);
148  stats_ += tempstats;
149 }
150 
155 template <typename It>
156 void HistogramBuilder::tryAdd(It begin, It end, StatisticData *localstats)
157 {
158  double combo_offset = 0, combo_scale = 1; // from storage value to bin number.
159 
160  // conversion from coding range to bin number.
161  histo_.calculateConversionFactors(&combo_offset, &combo_scale);
162 
163  // Performance note: If I add +1.5 to offset here, then instead of the
164  // RoundD2I later (round to nearest 0-based bin index) I would instead be doing
165  // a simple assign from float to int, rounding towards zero, to get as a result
166  // a 1-based bin index. Then subtract 1 to get the 0-based bin index.
167  // This is more efficient on x64 and when /arch:sse2 is given, but less
168  // efficient in plain x86 mode. The reason I wouldn't just add 0.5 is that
169  // it would not handle the range [-1, -0.5] correctly as they would map to
170  // bin 0 and not an invalid bin number.
171 
172  // More on the topic of rounding: If the value is so huge that it won't fit
173  // in a 32-bit integer then the RoundD2I result is technically undefined.
174  // I am assuming that the result won't fall in the range of valid bin numbers,
175  // and that *appears* to be true on the IA32 architecture. But the assumption
176  // is fragile. "n" must not be declared as unsigned; we need to cast it.
177  // See HardwareOptimized.h for a longer discussion.
178 
179  // To avoid having to iterate twice over the input, both the statistics
180  // and the histogram information will be updated in the same loop.
181  // Furthermore, the statistics will be collected in temporary variables
182  // so the compiler has a chance of placing them in registers.
183  // Yes, this is a bit messy because it muddles the separation of
184  // Histogram and Statistics, and it duplicates logic from the latter.
185  std::int64_t cnt = 0;
186  std::int64_t inf = 0;
187  double sum = 0;
188  double ssq = 0;
189  double min = std::numeric_limits<double>::max();
190  double max = -std::numeric_limits<double>::max();
191 
192  for (It i = begin; i != end; ++i) {
193  register double value = *i;
194 
195  // The test for std::isfinite is not needed if *i is an integral type.
196  if (std::numeric_limits<typename std::iterator_traits<It>::value_type>::is_integer || std::isfinite(value)) {
197 
198  int n = RoundD2I(combo_offset + combo_scale * value);
199  if (static_cast<unsigned int>(n) < static_cast<unsigned int>(gethisto().getsize()))
200  histo_.addToBinNumberUNSAFE(n);
201 
202  // Always accumulate statistics. localstats is almost always != 0.
203  sum += value;
204  ssq += value*value;
205  if (min > value)
206  min = value;
207  if (max < value)
208  max = value;
209  ++cnt;
210  } else {
211  ++inf;
212  }
213  }
214 
215  if (localstats != NULL) {
216  *localstats = StatisticData(cnt, inf, sum, ssq, min, max);
217  }
218 }
219 
220 } // end namespace
InternalZGY::HistogramData
A histogram for a data set.
Definition: histogramdata.h:75
statisticdata.h
provides class InternalZGY::StatisticData.
InternalZGY::StatisticData
Holds the result of computing statistics.
Definition: statisticdata.h:37
InternalZGY::HistogramBuilder
Collect statistics and histogram for bulk data.
Definition: histogrambuilder.h:51
histogramdata.h
provides class InternalZGY::HistogramData.
InternalZGY
Implementation not visible to clients.