bes  Updated for version 3.20.8
EffectiveUrlCache.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 
3 // This file is part of the BES http package, part of the Hyrax data server.
4 
5 // Copyright (c) 2020 OPeNDAP, Inc.
6 // Author: Nathan Potter <ndp@opendap.org>
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 //
22 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
23 
24 // Authors:
25 // ndp Nathan Potter <ndp@opendap.org>
26 
27 #include "config.h"
28 
29 #ifdef HAVE_STDLIB_H
30 #include <stdlib.h>
31 #endif
32 
33 #include <pthread.h>
34 
35 #include <sstream>
36 #include <string>
37 
38 #include "EffectiveUrlCache.h"
39 
40 #include "BESSyntaxUserError.h"
41 #include "TheBESKeys.h"
42 #include "BESDebug.h"
43 #include "BESStopWatch.h"
44 #include "BESUtil.h"
45 #include "BESLog.h"
46 #include "CurlUtils.h"
47 #include "HttpNames.h"
48 #include "EffectiveUrl.h"
49 
50 using namespace std;
51 
52 #define MODULE "euc"
53 #define MODULE_DUMPER "euc:dump"
54 #define prolog std::string("EffectiveUrlCache::").append(__func__).append("() - ")
55 
56 namespace http {
57 
58 EffectiveUrlCache *EffectiveUrlCache::d_instance = 0;
59 pthread_once_t EffectiveUrlCache::d_init_control = PTHREAD_ONCE_INIT;
60 
61 EucLock::EucLock(pthread_mutex_t &lock) : m_mutex(lock) {
62  int status = pthread_mutex_lock(&m_mutex);
63  if (status != 0){
64  throw BESInternalError(prolog + "Could not acquire mutex lock.", __FILE__, __LINE__);
65  }
66  BESDEBUG(MODULE,prolog << "Locked. (thread: " << pthread_self() << ")" << endl);
67 }
68 
69 EucLock::~EucLock() {
70  int status = pthread_mutex_unlock(&m_mutex);
71  if (status != 0){
72  ERROR_LOG(prolog + "Failed to release mutex lock.");
73  }
74  BESDEBUG(MODULE,prolog << "Unlocked. (thread: " << pthread_self() << ")" << endl);
75 }
76 
77 
96 EffectiveUrlCache *
97 EffectiveUrlCache::TheCache()
98 {
99  if (d_instance == 0) {
100  pthread_once(&d_init_control,EffectiveUrlCache::initialize_instance);
101  }
102 
103  return d_instance;
104 }
105 
110 void EffectiveUrlCache::initialize_instance()
111 {
112 
113  d_instance = new EffectiveUrlCache;
114 #ifdef HAVE_ATEXIT
115  atexit(delete_instance);
116 #endif
117 
118 }
119 
123 void EffectiveUrlCache::delete_instance()
124 {
125  delete d_instance;
126  d_instance = 0;
127 }
128 
133 EffectiveUrlCache::EffectiveUrlCache(): d_skip_regex(NULL), d_enabled(-1)
134 {
135  if (pthread_mutex_init(&d_get_effective_url_cache_mutex, 0) != 0)
136  throw BESInternalError("Could not initialize mutex in CurlHandlePool", __FILE__, __LINE__);
137 
138 }
139 
144 EffectiveUrlCache::~EffectiveUrlCache()
145 {
146  map<string , http::EffectiveUrl *>::iterator it;
147  for(it = d_effective_urls.begin(); it!= d_effective_urls.end(); it++){
148  delete it->second;
149  }
150  d_effective_urls.clear();
151 
152  if(d_skip_regex){
153  delete d_skip_regex;
154  d_skip_regex = 0;
155  }
156 }
157 
158 
166 void EffectiveUrlCache::dump(ostream &strm) const
167 {
168  strm << BESIndent::LMarg << prolog << "(this: " << (void *) this << ")" << endl;
169  BESIndent::Indent();
170  strm << BESIndent::LMarg << "d_skip_regex: " << (d_skip_regex?d_skip_regex->pattern():"WAS NOT SET") << endl;
171  if (!d_effective_urls.empty()) {
172  strm << BESIndent::LMarg << "effective url list:" << endl;
173  BESIndent::Indent();
174  auto it = d_effective_urls.begin();
175  while( it!= d_effective_urls.end()){
176  strm << BESIndent::LMarg << (*it).first << " --> " << (*it).second->str();
177  it++;
178  }
179  BESIndent::UnIndent();
180  }
181  else {
182  strm << BESIndent::LMarg << "effective url list: EMPTY" << endl;
183  }
184  BESIndent::UnIndent();
185 }
186 
195 {
196  stringstream sstrm;
197  dump(sstrm);
198  return sstrm.str();
199 }
200 
201 
206 http::EffectiveUrl *EffectiveUrlCache::get(const std::string &source_url){
207  http::EffectiveUrl *effective_url=NULL;
208  auto it = d_effective_urls.find(source_url);
209  if(it!=d_effective_urls.end()){
210  effective_url = (*it).second;
211  }
212  return effective_url;
213 }
214 
215 
216 //########################################################################################
217 //########################################################################################
218 //########################################################################################
219 
220 
228 string EffectiveUrlCache::get_effective_url(const string &source_url)
229 {
230 
231  // This lock will block until the mutex is available.
232  EucLock dat_lock(this->d_get_effective_url_cache_mutex);
233 
234  BESDEBUG(MODULE, prolog << "BEGIN url: " << source_url << endl);
235  string effective_url_str = source_url;
236 
237  if(is_enabled()){
238 
239  BESDEBUG(MODULE_DUMPER, prolog << "dump: " << endl << dump() << endl);
240 
241  size_t match_length=0;
242 
243  // if it's not an HTTP url there is nothing to cache.
244  if (source_url.find("http://") != 0 && source_url.find("https://") != 0) {
245  BESDEBUG(MODULE, prolog << "END Not an HTTP request, SKIPPING." << endl);
246  return effective_url_str;
247  }
248 
249  BESRegex *skip_regex = get_skip_regex();
250  if( skip_regex ) {
251  match_length = skip_regex->match(source_url.c_str(), source_url.length());
252  if (match_length == source_url.length()) {
253  BESDEBUG(MODULE, prolog << "END Candidate url matches the "
254  "no_redirects_regex_pattern [" << skip_regex->pattern() <<
255  "][match_length=" << match_length << "] SKIPPING." << endl);
256  return effective_url_str;
257  }
258  BESDEBUG(MODULE, prolog << "Candidate url: '" << source_url << "' does NOT match the "
259  "skip_regex pattern [" << skip_regex->pattern() << "]" << endl);
260  }
261  else {
262  BESDEBUG(MODULE, prolog << "The cache_effective_urls_skip_regex() was NOT SET "<< endl);
263  }
264 
265  http::EffectiveUrl *effective_url = get(source_url);
266 
267  // See if the data_access_url has already been processed into a terminal URL
268  bool retrieve_and_cache = !effective_url; // If there's no effective_url we gotta go get it.
269  if(effective_url){
270  BESDEBUG(MODULE, prolog << "Cache hit for: " << source_url << endl);
271  retrieve_and_cache = effective_url->is_expired();
272  BESDEBUG(MODULE, prolog << "Cached target URL is " << (retrieve_and_cache?"":"not ") << "expired." << endl);
273  }
274  // It not found or expired, reload.
275  if(retrieve_and_cache){
276  BESDEBUG(MODULE, prolog << "Acquiring effective URL for " << source_url << endl);
277 
278  {
279  BESStopWatch sw;
280  if(BESDebug::IsSet(MODULE) || BESDebug::IsSet(TIMING_LOG_KEY)) sw.start(prolog + "Retrieve and cache effective url for source url: " + source_url);
281  effective_url = curl::retrieve_effective_url(source_url);
282  }
283  BESDEBUG(MODULE, prolog << " source_url: " << source_url << endl);
284  BESDEBUG(MODULE, prolog << "effective_url: " << effective_url->dump() << endl);
285 
286  d_effective_urls[source_url] = effective_url;
287 
288  BESDEBUG(MODULE, prolog << "Updated record for "<< source_url << " cache size: " << d_effective_urls.size() << endl);
289  }
290  effective_url_str = effective_url->str();
291  BESDEBUG(MODULE_DUMPER, prolog << "dump: " << endl << dump() << endl);
292  } // EucLock dat_lock is released when the point of execution reaches this brace and dat_lock goes out of scope.
293  else {
294  BESDEBUG(MODULE, prolog << "CACHE IS DISABLED." << endl);
295  }
296  BESDEBUG(MODULE, prolog << "END" << endl);
297  return effective_url_str;
298 }
299 
300 
305 bool EffectiveUrlCache::is_enabled()
306 {
307  // The first time here, the value of d_enabled is -1. Once we check for it in TheBESKeys
308  // The value will be 0 (false) or 1 (true) and TheBESKeys will not be checked again.
309  if(d_enabled < 0){
310  bool found;
311  string value;
312  TheBESKeys::TheKeys()->get_value(HTTP_CACHE_EFFECTIVE_URLS_KEY,value,found);
313  BESDEBUG(MODULE, prolog << HTTP_CACHE_EFFECTIVE_URLS_KEY <<": '" << value << "'" << endl);
314  d_enabled = found && BESUtil::lowercase(value)=="true";
315  }
316  BESDEBUG(MODULE, prolog << "d_enabled: " << (d_enabled?"true":"false") << endl);
317  return d_enabled;
318 }
319 
324 BESRegex *EffectiveUrlCache::get_skip_regex()
325 {
326  if(!d_skip_regex){
327  bool found;
328  string value;
329  TheBESKeys::TheKeys()->get_value(HTTP_CACHE_EFFECTIVE_URLS_SKIP_REGEX_KEY, value, found);
330  if(found && value.length()){
331  BESDEBUG(MODULE, prolog << HTTP_CACHE_EFFECTIVE_URLS_SKIP_REGEX_KEY <<": " << value << endl);
332  d_skip_regex = new BESRegex(value.c_str());
333  }
334  }
335  BESDEBUG(MODULE, prolog << "d_skip_regex: " << (d_skip_regex?d_skip_regex->pattern():"Value has not been set.") << endl);
336  return d_skip_regex;
337 }
338 
339 
340 
341 
342 
343 } // namespace http
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:160
exception thrown if internal error encountered
int match(const char *s, int len, int pos=0)
Does the pattern match.
Definition: BESRegex.cc:107
virtual bool start(std::string name)
Definition: BESStopWatch.cc:67
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:200
void get_value(const std::string &s, std::string &val, bool &found)
Retrieve the value of a given key, if set.
Definition: TheBESKeys.cc:339
static TheBESKeys * TheKeys()
Definition: TheBESKeys.cc:71
std::string get_effective_url(const std::string &source_url)
virtual std::string dump() const
dumps information about this object
std::string dump() override
A string dump of the instance.
bool is_expired() override
Returns true if URL is reusable, false otherwise.
Definition: EffectiveUrl.cc:67
utility class for the HTTP catalog module
Definition: EffectiveUrl.cc:58