Wednesday, April 2, 2014

Java frameworks, encapsulation and code bloat, i.e., the JavaBeans specification failed us!

I have recently run into the following error in my JAX-RS application running in Jersey 2.6 (on Tomcat 6):

 Failed to generate the schema for the JAX-B elements  
 com.sun.xml.internal.bind.v2.runtime.IllegalAnnotationsException: 1 counts of IllegalAnnotationExceptions  
 com.tinypass.rest.v3.framework.dto.RestDTOFromEntity does not have a no-arg default constructor.  
   this problem is related to the following location:  
    at com.tinypass.rest.v3.framework.dto.RestDTOFromEntity  
    at com.tinypass.rest.v3.dto.AppDTO  
    at com.tinypass.rest.v3.dto.AppSensitiveDTO  
   at com.sun.xml.internal.bind.v2.runtime.IllegalAnnotationsException$Builder.check(IllegalAnnotationsException.java:91)  
   at com.sun.xml.internal.bind.v2.runtime.JAXBContextImpl.getTypeInfoSet(JAXBContextImpl.java:436)  
   at com.sun.xml.internal.bind.v2.runtime.JAXBContextImpl.<init>(JAXBContextImpl.java:277)  
   at com.sun.xml.internal.bind.v2.runtime.JAXBContextImpl$JAXBContextBuilder.build(JAXBContextImpl.java:1100)  
   at com.sun.xml.internal.bind.v2.ContextFactory.createContext(ContextFactory.java:143)  
   at com.sun.xml.internal.bind.v2.ContextFactory.createContext(ContextFactory.java:110)  
   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)  
   at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)  
   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)  
   at java.lang.reflect.Method.invoke(Method.java:597)  
   at javax.xml.bind.ContextFinder.newInstance(ContextFinder.java:202)  
   at javax.xml.bind.ContextFinder.find(ContextFinder.java:376)  
   at javax.xml.bind.JAXBContext.newInstance(JAXBContext.java:574)  
   at javax.xml.bind.JAXBContext.newInstance(JAXBContext.java:522)  
   at org.glassfish.jersey.server.wadl.internal.generators.WadlGeneratorJAXBGrammarGenerator.buildModelAndSchemas(WadlGeneratorJAXBGrammarGenerator.java:368)  
   at org.glassfish.jersey.server.wadl.internal.generators.WadlGeneratorJAXBGrammarGenerator.createExternalGrammar(WadlGeneratorJAXBGrammarGenerator.java:317)  
   at org.glassfish.jersey.server.wadl.internal.WadlBuilder.generate(WadlBuilder.java:121)  
   at org.glassfish.jersey.server.wadl.internal.WadlApplicationContextImpl.getApplication(WadlApplicationContextImpl.java:143)  
   at org.glassfish.jersey.server.wadl.internal.WadlApplicationContextImpl.getApplication(WadlApplicationContextImpl.java:162)  
   at org.glassfish.jersey.server.wadl.processor.WadlModelProcessor$OptionsHandler.apply(WadlModelProcessor.java:138)  
   at org.glassfish.jersey.server.wadl.processor.WadlModelProcessor$OptionsHandler.apply(WadlModelProcessor.java:120)  
   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)  
   at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)  
   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)  
   at java.lang.reflect.Method.invoke(Method.java:597)  
   at org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory$1.invoke(ResourceMethodInvocationHandlerFactory.java:81)  
   at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:151)  
   at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:171)  
   at org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$ResponseOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:152)  
   at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:104)  
   at org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:406)  
   at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:350)  
   at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:106)  
   at org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:259)  
   at org.glassfish.jersey.internal.Errors$1.call(Errors.java:271)  
   at org.glassfish.jersey.internal.Errors$1.call(Errors.java:267)  
   at org.glassfish.jersey.internal.Errors.process(Errors.java:315)  
   at org.glassfish.jersey.internal.Errors.process(Errors.java:297)  
   at org.glassfish.jersey.internal.Errors.process(Errors.java:267)  
   at org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:319)  
   at org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:236)  
   at org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:1028)  
   at org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:373)  
   at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:381)  
   at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:344)  
   at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:219)  
   at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290)  
   at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)  
   at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)  
   at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)  
   at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)  
   at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)  
   at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)  
   at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:298)  
   at org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:859)  
   at org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:588)  
   at org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)  
   at java.lang.Thread.run(Thread.java:662)  


It looks like this happens when a user tries to retrieve the application.wadl file. The error occurs because the class RestDTOFromEntity does not have a default constructor: JAXB instantiates classes reflectively, so it requires a no-arg constructor in order to generate the WADL file. This can be fixed by adding a private no-arg constructor to RestDTOFromEntity.
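
A minimal sketch of the fix (the field here is made up; the real class is obviously more involved):

 public class RestDTOFromEntity {  
   private final String name; // hypothetical field, for illustration only  

   public RestDTOFromEntity(String name) {  
     if (name == null) throw new IllegalArgumentException("name is required");  
     this.name = name;  
   }  

   // for JAXB only: it instantiates classes reflectively, but the no-arg  
   // constructor it needs does not have to be public, so the public API  
   // (and the encapsulation) stays unchanged  
   private RestDTOFromEntity() {  
     this.name = null;  
   }  

   public String getName() {  
     return name;  
   }  
 }  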

The absence of a default constructor in that class is a deliberate design choice made to properly encapsulate its contents. It's a safe way to force programmers to provide the required constructor parameters (and the constructor can appropriately throw on nulls, etc.). A non-default constructor would also be mandatory if the class were immutable (which it is not in this case). This and other encapsulation choices are very common in "academic" object-oriented design.

In this specific example, the solution does not break the encapsulation of the class; however, it does add bloat in the form of a private no-arg constructor that sits there for no apparent reason (and I believe there might be instances in which framework support requires breaking encapsulation or other OOP principles outright).

I feel like most Java frameworks built on the JavaBeans specification end up running into this kind of problem. Here are some examples:
  • It's common practice in Spring to have default constructors and setters: this is terrible OOP design that leaves the object open to all kinds of potential bugs. It can be fixed by using constructor injection, but hardly anyone does that, both because it's a feature that was added later and because everyone is so used to the JavaBean paradigm.
  • Again, in Spring, it's common practice to instantiate beans with "singleton" scope: this "singleton" is just an instance of a class, so if your application accidentally starts two application contexts, you end up with two singletons! This problem can be solved by using an actual singleton and a factory method, e.g., `getInstance()`, to register the singleton with the Spring context. The method could also take care of DI, but again, this is an even more obscure feature of Spring that hardly anyone uses.
  • Objects serialized/deserialized with Jackson typically require a default constructor unless a constructor is annotated as a JsonCreator. Once again, the JavaBean paradigm is so widespread that many Java programmers look at code like that and think I'm insane. Besides, the JsonCreator constructor usually looks pretty terrible and requires defining the property names in two different places (code bloat + duplication), as the sketch below shows.
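
For reference, here is roughly what such a constructor looks like (a made-up DTO, assuming Jackson 2.x on the classpath); note how each property name appears both in the class and in a @JsonProperty annotation:

 import com.fasterxml.jackson.annotation.JsonCreator;  
 import com.fasterxml.jackson.annotation.JsonProperty;  

 public final class AccountDTO {  
   private final String name;  
   private final int balance;  

   // the constructor enforces invariants, but every property name  
   // has to be repeated inside a @JsonProperty annotation  
   @JsonCreator  
   public AccountDTO(@JsonProperty("name") String name,  
                     @JsonProperty("balance") int balance) {  
     if (name == null) throw new IllegalArgumentException("name is required");  
     this.name = name;  
     this.balance = balance;  
   }  

   public String getName() { return name; }  
   public int getBalance() { return balance; }  
 }  
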
In my experience with (legacy) Java web apps, these problems are compounded by generally poor design practices, once again fostered by the JavaBean model: poor encapsulation and lack of immutability. This, together with layer violations in monolithic Java apps (such as a Spring app, ironically!), e.g., using ORM objects in frontend code, leads to unmanageable and fragile code.

Some could argue that being diligent and methodical can keep programmers from falling into bad practices. However, in my experience this also means swimming against the current, because following the right path sometimes leads to horrible code bloat ("what's that giant constructor with all the properties? use setters!") in the best case and to straight-up bugs (like the one above) in the worst case. Taking the easy way out is then justified by the mere existence of the JavaBean paradigm: every class becomes mutable, with every property (be it a simple type or an object) effectively escaping encapsulation and thread safety (which gets "fixed" with heavyweight synchronized blocks). So in one single move, the JavaBeans specification flushes down the toilet all the "academic" 101 design practices and makes our lives so much harder in the long run.

JavaBeans, you have failed me!

Monday, January 6, 2014

ORM hierarchy design in Scala

I recently started working heavily with Scala. Scala is a really expressive language built on both the object-oriented and functional programming paradigms. I often think of it as the 21st-century evolution of Java.

One of the tasks I have been busy with is designing an ORM entity hierarchy in Scala to interface with a data store. The nature of the data store is irrelevant: it can be either a relational database or a NoSQL solution, and the issues I am facing are the same for both.

The requirements of my ORM hierarchy are simple:
  1. Each entity must be modeled by an immutable and final class;
  2. Each entity should be easily convertible to a map from field name to field value;
  3. Each entity should contain a "static" field indicating the table name;
  4. It would be nice to have the ability to pattern-match on the entities.
Having immutable objects is one of the staples of Scala programming, and it makes a lot of sense to have this as a requirement for a Scala-based ORM entity. Immutable objects make it much easier to reason about code and to ensure thread safety. On top of that, having final entities ensures additional safety.

Being able to convert an entity to a map from field names to field values makes it easy to write methods that store the entity in a denormalized fashion, e.g., in a NoSQL data store such as MongoDB or Riak. Moreover, as I will show in the following examples, it simplifies the code quite a lot.

The perfect candidates for ORM entities in Scala are case classes. Case classes are immutable, allow pattern matching, provide compiler-generated accessors as well as compiler-generated hashCode and equals methods (which reduce the amount of boilerplate code and thus of error-proneness) and, not least, are serializable out of the box. The only tricky thing with case classes is the conversion to/from a map as per requirement 2. This can be done using reflection or macros, but it's not trivial.

There is, however, one big limitation to case classes that takes them out of the picture: the number of fields a case class can contain is limited to 22. This is a design limitation that will be lifted by Scala 2.11. However, the 2.11 milestone releases are not binary compatible with 2.10, thus requiring all dependent libraries to be recompiled for 2.11; this might not be an easy task for most. Although many would argue that this limitation can be worked around using nested fields, most ORM designers know that it's very easy to end up with wide denormalized tables of more than 22 fields, especially in legacy systems.

Mainly for this reason, and because of requirement 2, I came up with a different solution to this problem. Let's assume we have a couple of entities, as in this code snippet:

 sealed trait GenericEntity {  
  val id: Option[String]  
  val tableName: String  
  val fields: Map[String, Any]  
 }  
 sealed trait FirstEntity extends GenericEntity {  
  val tableName = "first"  
  val x: String  
  val y: Option[String]  
  val z: Option[Int]  
 }  
 sealed trait SecondEntity extends GenericEntity {  
  val tableName = "second"  
  val w: Option[FirstEntity]  
  val t: Float  
  val i: Option[String]  
 }  

The entities are modeled as sealed traits and declare the members required by requirements 2 and 3. The traits are sealed so that the concrete implementations can only be defined in the same file (helping enforce requirement 1).

Now to the implementation of the entities. I will define a companion object for each of the two entities, in which the apply method is responsible for creating concrete instances of the traits. The unapply method allows for pattern matching, although that will only work for entities with fewer than 22 fields. Here's the implementation of the two companion objects:

  object FirstEntity {  
   def apply(id: Option[String], x: String, y: Option[String] = None, z: Option[Int] = None): FirstEntity = new FirstEntityImpl(id, x, y, z)  
   def unapply(f: FirstEntity) = Some((f.id, f.x, f.y, f.z))  
   private case class FirstEntityImpl(id: Option[String] = None, fields: Map[String, Any]) extends FirstEntity {  
    def this(id: Option[String], x: String, y: Option[String], z: Option[Int]) =  
     this(id, {  
      val m = collection.mutable.Map[String, Any]()  
      m("x") = x  
      addTo(m, "y", y)  
      addTo(m, "z", z)  
      Map() ++ m.toMap  
     })  
    val x: String = fields("x").asInstanceOf[String]  
    val y: Option[String] = fields.get("y").asInstanceOf[Option[String]]  
    val z: Option[Int] = fields.get("z").asInstanceOf[Option[Int]]  
   }  
  }  
  object SecondEntity {  
   def apply(id: Option[String], w: Option[FirstEntity] = None, t: Float, i: Option[String] = None): SecondEntity = new SecondEntityImpl(id, w, t, i)  
   private case class SecondEntityImpl(id: Option[String] = None, fields: Map[String, Any]) extends SecondEntity {  
    def this(id: Option[String], w: Option[FirstEntity], t: Float, i: Option[String]) =  
     this(id, {  
      val m = collection.mutable.Map[String, Any]()  
      m("t") = t  
      addTo(m, "w", w)  
      addTo(m, "i", i)  
      Map() ++ m.toMap  
     })  
    val w: Option[FirstEntity] = fields.get("w").asInstanceOf[Option[FirstEntity]]  
    val t: Float = fields("t").asInstanceOf[Float]  
    val i: Option[String] = fields.get("i").asInstanceOf[Option[String]]  
   }  
  }  
  private def addTo(map: collection.mutable.Map[String, Any], k: String, v: Option[Any]) = v.foreach(map(k) = _)  

The companion objects provide the concrete implementation of the entities. This is a simple case class which defines the entity id and a map of fields. The id field is there just for convenience, but it could be kept in the map itself instead (or be absent). The immutable map representing the whole entity is created at construction time, and the entity fields (optional or mandatory) are then extracted from the map. One of the flaws of this design is that a cast is necessary per field at object creation. Another (minor) flaw is that the field keys show up twice in the code (this could be addressed by defining them as private string constants, which is a little more verbose but a little safer). With some legwork, the apply/unapply methods could be automatically generated by a macro on the trait plus a map from val name to field name.

Here's a full example of how to use these objects:

 object Main extends App {  
  val f: FirstEntity = FirstEntity(id = Some("123"), x = "asdf", z = Some(5324))  
  val g: FirstEntity = FirstEntity(id = Some("123"), x = "asdf", z = Some(5324))  
  assert(f == g)  
  assert(f.id.get == "123")  
  assert(f.x == "asdf")  
  assert(f.y.isEmpty)  
  assert(f.z.get == 5324)  
  assert(f.fields == Map("x" -> "asdf", "z" -> 5324))  
  val x: SecondEntity = SecondEntity(id = None, w = Some(f), t = 123.232f, i = Some("blah"))  
  // this line will cause: recursive value f needs type  
  // val z = SecondEntity(id = None, w = Some(f), t = 123.232f, i = Some("blah"))  
  val y: SecondEntity = SecondEntity(id = None, w = Some(g), t = 123.232f, i = Some("blah"))  
  assert(x == y)  
  assert(x.fields == Map("w" -> f, "t" -> 123.232f, "i" -> "blah"))  
  doSomething(f)  
  def doSomething(f: FirstEntity) {  
   f match {  
    case FirstEntity(Some(id), xx, Some(yy), Some(zz)) => println(s"$id, $xx, $yy, $zz")  
    case FirstEntity(Some(id), xx, None, Some(zz)) => println(s"$id, $xx, $zz")  
    case _ => println("not matched")  
   }  
  }  
  sealed trait GenericEntity {  
   val id: Option[String]  
   val tableName: String  
   val fields: Map[String, Any]  
  }  
  sealed trait FirstEntity extends GenericEntity {  
   val tableName = "first"  
   val x: String  
   val y: Option[String]  
   val z: Option[Int]  
  }  
  sealed trait SecondEntity extends GenericEntity {  
   val tableName = "second"  
   val w: Option[FirstEntity]  
   val t: Float  
   val i: Option[String]  
  }  
  object FirstEntity {  
   def apply(id: Option[String], x: String, y: Option[String] = None, z: Option[Int] = None): FirstEntity = new FirstEntityImpl(id, x, y, z)  
   def unapply(f: FirstEntity) = Some((f.id, f.x, f.y, f.z))  
   private case class FirstEntityImpl(id: Option[String] = None, fields: Map[String, Any]) extends FirstEntity {  
    def this(id: Option[String], x: String, y: Option[String], z: Option[Int]) =  
     this(id, {  
      val m = collection.mutable.Map[String, Any]()  
      m("x") = x  
      addTo(m, "y", y)  
      addTo(m, "z", z)  
      Map() ++ m.toMap  
     })  
    val x: String = fields("x").asInstanceOf[String]  
    val y: Option[String] = fields.get("y").asInstanceOf[Option[String]]  
    val z: Option[Int] = fields.get("z").asInstanceOf[Option[Int]]  
   }  
  }  
  object SecondEntity {  
   def apply(id: Option[String], w: Option[FirstEntity] = None, t: Float, i: Option[String] = None): SecondEntity = new SecondEntityImpl(id, w, t, i)  
   private case class SecondEntityImpl(id: Option[String] = None, fields: Map[String, Any]) extends SecondEntity {  
    def this(id: Option[String], w: Option[FirstEntity], t: Float, i: Option[String]) =  
     this(id, {  
      val m = collection.mutable.Map[String, Any]()  
      m("t") = t  
      addTo(m, "w", w)  
      addTo(m, "i", i)  
      Map() ++ m.toMap  
     })  
    val w: Option[FirstEntity] = fields.get("w").asInstanceOf[Option[FirstEntity]]  
    val t: Float = fields("t").asInstanceOf[Float]  
    val i: Option[String] = fields.get("i").asInstanceOf[Option[String]]  
   }  
  }  
  private def addTo(map: collection.mutable.Map[String, Any], k: String, v: Option[Any]) = v.foreach(map(k) = _)  
 }  

Using the companion object is convenient since it makes the pattern-matching syntax more intuitive. It's also possible to keep the same syntax and have several different implementations of the same ORM hierarchy in the same file by encapsulating "pseudo" companion objects in their own ORM container object, as in:

 // sealed trait classes definitions...  
 object RDBMSEntities {  
  object FirstEntity {  
   // apply function/case class for RDBMS  
  }  
  object SecondEntity {  
   // apply function/case class for RDBMS  
  }  
 }  
 object NoSQLEntities {  
  object FirstEntity {  
   // apply function/case class for NoSQL DB  
  }  
  object SecondEntity {  
   // apply function/case class for NoSQL DB  
  }  
 }  

And then the different implementations can be used by importing one or the other implementation, i.e., import RDBMSEntities._ or import NoSQLEntities._.

To summarize, while we wait for Scala 2.11 to be released and become popular enough that most third-party libraries are compiled against it, the approach described here can be a suitable solution for ORM design in Scala. More work is required to create macros that write a lot of this boilerplate code for us, but the code in this post is a start. Lists of pros and cons follow.

Pros:
  • all entities are traits -> additional logic/mixins possible in the implementation;
  • compiler-generated equals and hashCode methods -> less boilerplate/errors;
  • pattern matching is possible (with less than 22 fields) -> nice to have;
  • immutable entities (case classes/immutable map) -> more safety;
  • the map can be used to easily store the entities in denormalized form, e.g., in MongoDB or RDBMS -> less boilerplate/errors, no need for annotations;
  • the object corresponding to each trait might be generated using a Scala macro (TODO!) -> less boilerplate/errors;
  • all entities are in fact sealed and can't be extended -> more security.
Cons:
  • need to write: apply (and unapply), accessors in case class, additional constructor -> more boilerplate (unless macro-generated);
  • pattern matching impossible for entities with more than 22 fields -> not a deal breaker;
  • requires one cast per accessor in the case class creation -> more boilerplate/less safety.

Tuesday, November 26, 2013

Fun with Guava's ListeningExecutorService

As a follow-up to my previous blog post, I decided to rewrite the code sample using the more advanced Google Guava ListeningExecutorService and ListenableFuture API, so here it is:

 import com.google.common.util.concurrent.Futures;  
 import com.google.common.util.concurrent.ListenableFuture;  
 import com.google.common.util.concurrent.ListeningExecutorService;  
 import com.google.common.util.concurrent.MoreExecutors;  
 import java.util.ArrayList;  
 import java.util.List;  
 import java.util.Random;  
 import java.util.Scanner;  
 import java.util.concurrent.Callable;  
 import java.util.concurrent.ExecutionException;  
 import java.util.concurrent.Executors;  

 class ListenableFutureExample {  
   public static void main(String[] args) {  
     Scanner in = new Scanner(System.in);  
     final int nThreads = in.nextInt();  
     final int n = in.nextInt();  
     System.out.println("Using " + nThreads + " threads");  
     ListeningExecutorService service = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(nThreads));  
     try {  
       try {  
         testSomeWorkers(service, n);  
       } catch (InterruptedException | ExecutionException e) {  
         e.printStackTrace();  
       }  
       try {  
         testJobCanceling(service, n);  
       } catch (InterruptedException | ExecutionException e) {  
         e.printStackTrace();  
       }  
     } finally {  
       // necessary or the thread pool will keep the JVM up and running!  
       service.shutdown();  
     }  
   }  

   public static void testSomeWorkers(ListeningExecutorService service, int n) throws InterruptedException, ExecutionException {  
     // Guava's utility method successfulAsList returns all the (successful)  
     // results in a single future list, enormously simplifying the code:  
     ListenableFuture<List<Integer>> ret = Futures.successfulAsList(addSomeTasks(service, n));  
     // the call to get() is now the blocking piece of code  
     System.out.println("Values returned from computations: " + ret.get());  
     System.out.println("All done.");  
   }  

   public static void testJobCanceling(ListeningExecutorService service, int n) throws InterruptedException, ExecutionException {  
     List<ListenableFuture<Integer>> tasks = addSomeTasks(service, n);  
     ListenableFuture<List<Integer>> ret = Futures.allAsList(tasks);  
     Thread.sleep(1000);  
     System.out.println("Actually nevermind!");  
     ret.cancel(true);  
     // let's see how many tasks were actually cancelled by asking the original futures:  
     List<Integer> completed = new ArrayList<>();  
     for (ListenableFuture<Integer> f : tasks) if (!f.isCancelled()) completed.add(f.get());  
     System.out.println("There were " + (n - completed.size()) + " cancelled tasks and " + completed.size() + " completed tasks: " + completed);  
     System.out.println("All done.");  
   }  

   private static List<ListenableFuture<Integer>> addSomeTasks(ListeningExecutorService service, int howMany) {  
     System.out.println("Enqueuing " + howMany + " tasks...");  
     List<ListenableFuture<Integer>> ret = new ArrayList<>();  
     for (int i = 1; i <= howMany; i++) {  
       final int n = i;  
       ret.add(service.submit(new Callable<Integer>() {  
         @Override  
         public Integer call() {  
           try {  
             try {  
               System.out.println("Task " + n + ": Doing some very important work...");  
               Thread.sleep(200 + rnd.nextInt(200));  
             } catch (InterruptedException e) {  
               System.out.println("Task " + n + " interrupted while doing very important work");  
               return null;  
             }  
             try {  
               System.out.println("Task " + n + ": Doing more important work...");  
               Thread.sleep(200 + rnd.nextInt(200));  
             } catch (InterruptedException e) {  
               System.out.println("Task " + n + " interrupted while doing important work");  
               return null;  
             }  
             try {  
               System.out.println("Task " + n + ": Doing slightly less important work...");  
               Thread.sleep(200 + rnd.nextInt(200));  
             } catch (InterruptedException e) {  
               System.out.println("Task " + n + " interrupted while doing slightly less important work");  
               return null;  
             }  
             int ret = rnd.nextInt();  
             System.out.println("Task " + n + ": about to return " + ret);  
             return ret;  
           } finally {  
             System.out.println("Task " + n + ": cleaning up");  
           }  
         }  
       }));  
     }  
     return ret;  
   }  
   private final static Random rnd = new Random();  
 }  


The Guava API has the advantage of allowing one to register callbacks on a ListenableFuture and to apply transformations to futures, resulting in a chain of non-blocking operations, very much like Scala's Futures. Non-blocking concurrency greatly limits resource usage, if used properly, since there are no idle threads blocking while waiting on the results of other threads' computations.
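
As a minimal sketch (the class and the values are made up; it assumes Guava on the classpath), here is how a callback can replace a blocking call to get():

 import com.google.common.util.concurrent.FutureCallback;  
 import com.google.common.util.concurrent.Futures;  
 import com.google.common.util.concurrent.ListenableFuture;  
 import com.google.common.util.concurrent.ListeningExecutorService;  
 import com.google.common.util.concurrent.MoreExecutors;  
 import java.util.concurrent.Callable;  
 import java.util.concurrent.Executors;  

 class CallbackExample {  
   public static void main(String[] args) {  
     ListeningExecutorService service =  
         MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(1));  
     ListenableFuture<Integer> future = service.submit(new Callable<Integer>() {  
       @Override  
       public Integer call() {  
         return 42;  
       }  
     });  
     // the callback fires when the future completes; nothing blocks here  
     Futures.addCallback(future, new FutureCallback<Integer>() {  
       @Override  
       public void onSuccess(Integer result) {  
         System.out.println("Computed: " + result);  
       }  
       @Override  
       public void onFailure(Throwable t) {  
         t.printStackTrace();  
       }  
     });  
     service.shutdown();  
   }  
 }  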

Just for illustration purposes, an example of using Guava's future transformation capabilities to implement fully non-blocking asynchronous computations follows. Enjoy!


 import com.google.common.base.Function;  
 import com.google.common.base.Optional;  
 import com.google.common.util.concurrent.Futures;  
 import com.google.common.util.concurrent.ListenableFuture;  
 import com.google.common.util.concurrent.ListeningExecutorService;  
 import com.google.common.util.concurrent.MoreExecutors;  
 import java.math.BigInteger;  
 import java.util.ArrayList;  
 import java.util.List;  
 import java.util.Random;  
 import java.util.concurrent.Callable;  
 import java.util.concurrent.ExecutionException;  
 import java.util.concurrent.Executors;  

 class ListenableFutureChain {  
   public static void main(String[] args) {  
     ListenableFutureChain chain = new ListenableFutureChain(4);  
     // let's find a few 512 bit prime numbers for our awesome encryption algorithm!  
     ListenableFuture<List<BigInteger>> probablePrimes = chain.findSomePrimeNumbers(20, 512);  
     // now finally do something with the future prime list  
     try {  
       // WARNING: this call is blocking, for illustration purposes only.  
       // It's recommended to design so that you don't need to do this,  
       // as in the function findSomePrimeNumbers()  
       for (BigInteger i : probablePrimes.get()) System.out.println(i);  
     } catch (InterruptedException | ExecutionException e) {  
       e.printStackTrace();  
     }  
     // remember to call this or the executor service will keep the JVM "awake"!  
     chain.dispose();  
   }  

   public ListenableFutureChain(final int nThreads) {  
     executorService = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(nThreads));  
   }  

   public void dispose() {  
     executorService.shutdown();  
   }  

   private final ListeningExecutorService executorService;  
   private final Random random = new Random();  

   public ListenableFuture<List<BigInteger>> findSomePrimeNumbers(final int nAttempts, final int nBits) {  
     List<ListenableFuture<Optional<BigInteger>>> probablePrimes = new ArrayList<>(nAttempts);  
     for (int i = 0; i < nAttempts; i++) {  
       // submit a task for execution and retrieve the ListenableFuture  
       ListenableFuture<BigInteger> probablePrimeFuture = executorService.submit(new Callable<BigInteger>() {  
         @Override  
         public BigInteger call() throws Exception {  
           // I'm going to find a probable prime number  
           return BigInteger.probablePrime(nBits, random);  
         }  
       });  
       // transform the previous ListenableFuture using a function; returns another ListenableFuture (non blocking operation)  
       ListenableFuture<Optional<BigInteger>> primeOrNot = Futures.transform(probablePrimeFuture, new Function<BigInteger, Optional<BigInteger>>() {  
         @Override  
         public Optional<BigInteger> apply(BigInteger p) {  
           // I'm going to return only the probable primes that are actually prime  
           if (isPrime(p)) return Optional.of(p);  
           return Optional.absent();  
         }  
       }, executorService);  
       // add the second future to a list  
       probablePrimes.add(primeOrNot);  
     }  
     // transform the list of futures to a future of list (Guava magic!), only retain successful futures (again, non blocking!)  
     ListenableFuture<List<Optional<BigInteger>>> primesOrNoValues = Futures.successfulAsList(probablePrimes);  
     // transform the future list to a future list containing only the prime numbers in question and return this future (still non blocking)  
     return Futures.transform(primesOrNoValues, new Function<List<Optional<BigInteger>>, List<BigInteger>>() {  
       @Override  
       public List<BigInteger> apply(List<Optional<BigInteger>> primes) {  
         List<BigInteger> ret = new ArrayList<>(primes.size());  
         // keep only the values that actually turned out to be prime  
         for (Optional<BigInteger> optional : primes)  
           if (optional != null && optional.isPresent()) ret.add(optional.get());  
         return ret;  
       }  
     }, executorService);  
     // Note that this whole function is non blocking; you can tell by the fact that there's no InterruptedException being thrown anywhere.  
   }  

   private boolean isPrime(final BigInteger p) {  
     // TODO: do some fancy primality test! (note that this would take a while in real life)  
     // let's just return true or false randomly for now... ;)  
     return random.nextBoolean();  
   }  
 }  

Tuesday, November 19, 2013

Fun with Java's ExecutorService

It's surprisingly difficult to find a decent example of the Java Executor framework that explains some of the API's "gotchas". It took me a while to figure it out way back when, so I decided to post a hopefully useful example. In the code below, I use an executor service to execute some work in parallel. This is the easiest way to exploit simple parallelism in Java 7 (Java 8 introduces parallel streams, which make things even simpler). The executor service is created as a fixed thread pool using a convenient static factory method available in the Executors class. It's very instructive to look at the javadoc for this and the other static factory methods available in this class (and even at the source code, if you are feeling adventurous) to understand how the thread pools are created and what additional options are available to customize them.

The testSomeWorkers() method creates a few tasks (instances of Callable), invokes them using the executor service and retrieves their results using the corresponding Futures. The helper method addSomeTasks() simply calls the submit() method on the executor service to submit the n tasks and retrieve the corresponding futures. Each future will return the value returned by its Callable once the execution is over.

The testJobCanceling() method works pretty much like testSomeWorkers(), but it shows how the tasks can be cancelled using the Future.cancel() method. The cancel() method's only argument specifies whether or not the thread in which the job is running should be interrupted. In the Callable.call() function, invoking Thread.sleep() can throw an InterruptedException. This will happen when we call cancel(true) on the future associated with the callable, and the exception is handled by returning immediately. It's a common idiom in Java to handle interruptions in this fashion to achieve some control over concurrent workers.

Enjoy the code:

 import java.util.*;  
 import java.util.concurrent.*;  
 class ExecutorServiceExample {  
   public static void main(String[] args) {  
     Scanner in = new Scanner(System.in);  
     final int nThreads = in.nextInt();  
     final int n = in.nextInt();  
     System.out.println("Using " + nThreads + " threads");  
     ExecutorService service = Executors.newFixedThreadPool(nThreads);  
     try {  
       testSomeWorkers(service, n);  
     } catch (InterruptedException | ExecutionException e) {  
       e.printStackTrace();  
     }  
     try {  
       testJobCanceling(service, n);  
     } catch (InterruptedException | ExecutionException e) {  
       e.printStackTrace();  
     }  
     // necessary or the thread pool will keep the JVM up and running!  
     service.shutdown();  
   }  
   public static void testSomeWorkers(ExecutorService service, int n) throws InterruptedException, ExecutionException {  
     // create and invoke some "tasks" on this executor service  
     Collection<Future<Integer>> taskFutures = addSomeTasks(service, n);  
     System.out.println("Waiting for all tasks to complete...");  
     List<Integer> ret = new ArrayList<>();  
     // retrieve the result of the tasks' computation  
     for (Future<Integer> f : taskFutures) ret.add(f.get());  
     System.out.println("Values returned from computations: " + ret);  
     System.out.println("All done.");  
   }  
   public static void testJobCanceling(ExecutorService service, int n) throws InterruptedException, ExecutionException {  
     Collection<Future<Integer>> taskFutures = addSomeTasks(service, n);  
     Thread.sleep(1000);  
     System.out.println("Actually nevermind!");  
     List<Integer> completed = new ArrayList<>();  
     List<Future<Integer>> cancelled = new ArrayList<>();  
     // try to cancel the tasks that are running  
     for (Future<Integer> f : taskFutures) {  
       // if successfully cancel add to the cancelled list  
       if (f.cancel(true)) cancelled.add(f);  
         // otherwise get the result  
       else completed.add(f.get());  
     }  
     System.out.println("" + cancelled.size() + " tasks were successfully cancelled");  
     if (!completed.isEmpty()) System.out.println("Values returned from computations: " + completed);  
     System.out.println("All done.");  
   }  
   private static Collection<Future<Integer>> addSomeTasks(ExecutorService service, int howMany) {  
     System.out.println("Enqueuing " + howMany + " tasks...");  
     List<Future<Integer>> ret = new ArrayList<>();  
     for (int i = 0; i < howMany; i++) {  
       final int n = i;  
       ret.add(service.submit(new Callable<Integer>() {  
         @Override  
         public Integer call() {  
           try {  
             try {  
               System.out.println("Task " + n + ": Doing some very important work...");  
               Thread.sleep(200 + rnd.nextInt(200));  
             } catch (InterruptedException e) {  
               System.out.println("Task " + n + " interrupted while doing very important work");  
               return null;  
             }  
             try {  
               System.out.println("Task " + n + ": Doing more important work...");  
               Thread.sleep(200 + rnd.nextInt(200));  
             } catch (InterruptedException e) {  
               System.out.println("Task " + n + " interrupted while doing important work");  
               return null;  
             }  
             try {  
               System.out.println("Task " + n + ": Doing slightly less important work...");  
               Thread.sleep(200 + rnd.nextInt(200));  
             } catch (InterruptedException e) {  
               System.out.println("Task " + n + " interrupted while doing slightly less important work");  
               return null;  
             }  
             return rnd.nextInt();  
           } finally {  
             System.out.println("Cleaning up after task " + n);  
           }  
         }  
       }));  
     }  
     return ret;  
   }  
   private final static Random rnd = new Random();  
 }  

Tuesday, August 21, 2012

Rabin-Karp Algorithm

Here's a juicy code example implementing the Rabin-Karp algorithm in Java. The Rabin-Karp algorithm is a string-matching algorithm: given a text and a pattern string, the algorithm returns the location(s) of the given pattern in the text. Assuming the size of the text is n characters and the size of the pattern is m characters, the average and best-case running time is O(n+m).
The idea is quite simple: treat the pattern p_0..p_{m-1} as if it were a number expressed in radix-d notation, i.e., of the form:
P = p_0 × d^{m-1} + p_1 × d^{m-2} + ... + p_{m-1} × d^0
Now, a match is found if a substring of the text represents the same exact integer, i.e., the pattern p_0..p_{m-1} matches a substring t_k..t_{k+m-1} (0 ≤ k ≤ n-m), or equivalently:
P = T_k = t_k × d^{m-1} + t_{k+1} × d^{m-2} + ... + t_{k+m-1} × d^0
It's easy to see that the radix-d integer for the substring starting at index k can be computed from the radix-d integer for the substring starting at k-1:
T_k = t_k × d^{m-1} + t_{k+1} × d^{m-2} + ... + t_{k+m-1} × d^0 =
     = (t_{k-1} × d^{m-1} + t_k × d^{m-2} + ... + t_{k+m-2} × d^0 - t_{k-1} × d^{m-1}) × d + t_{k+m-1} =
     = (T_{k-1} - t_{k-1} × d^{m-1}) × d + t_{k+m-1}
That is: subtract from the previous value its first character multiplied by the radix raised to the pattern length minus one, then multiply by the radix and add the last character of the new substring. It's very straightforward on paper, although it can be tricky to get right.
This approach would certainly work if we could easily store the radix-d numbers represented by the strings in some integer type. Unfortunately, this is almost never the case. Let's look at a simple example: assume your char size is 8 bits, so that you would naturally choose the radix 256 = 2^8, since each digit represents a value between 0 and 255 = 2^8 - 1. An m-char string can then express numbers between 0 and 256^m - 1 = 2^{8m} - 1. If we want to store these integers in a 64-bit unsigned long type, we can only represent strings of length up to 8, since 2^{8×8} = 2^64. Bottom line: we can't express P and T_k using machine integers.
This is where the most powerful idea of the Rabin-Karp algorithm comes into play. We don't need to represent these huge numbers, only their values modulo a big enough prime q, i.e., P mod q and T_k mod q. The catch is that if the match is positive, i.e., the hash codes are identical, we still need to check the pattern and the substring for equality to make sure it's not a false positive. This obviously increases the computation time if there are a lot of false positives.
One last thing worth noting is that the Rabin-Karp algorithm works very well for matching multiple patterns all at once: instead of storing a single key P, we can store a set of keys P_1, ..., P_l, one per pattern. Great care has to be taken in computing the keys T_k now, since the patterns can have different lengths, but once the details are right, it's quite straightforward.
A simple Java implementation of the Rabin-Karp algorithm to find all instances of a single pattern in a text follows.
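
This is a minimal sketch along the lines described above (RADIX and PRIME are arbitrary choices for illustration):

 import java.util.ArrayList;  
 import java.util.List;  

 public class RabinKarp {  
   private static final int RADIX = 256;          // radix d: one digit per 8-bit char  
   private static final long PRIME = 1000000007L; // a big prime q for the modulus  

   // returns the starting indices of all occurrences of pattern in text  
   public static List<Integer> search(String text, String pattern) {  
     List<Integer> matches = new ArrayList<>();  
     int n = text.length(), m = pattern.length();  
     if (m == 0 || m > n) return matches;  
     // h = d^(m-1) mod q, used to drop the leading digit in the rolling hash  
     long h = 1;  
     for (int i = 0; i < m - 1; i++) h = (h * RADIX) % PRIME;  
     // compute P mod q and T_0 mod q  
     long p = 0, t = 0;  
     for (int i = 0; i < m; i++) {  
       p = (p * RADIX + pattern.charAt(i)) % PRIME;  
       t = (t * RADIX + text.charAt(i)) % PRIME;  
     }  
     for (int k = 0; k <= n - m; k++) {  
       // on a hash match, compare the actual characters to rule out false positives  
       if (p == t && text.regionMatches(k, pattern, 0, m)) matches.add(k);  
       if (k < n - m)  
         // T_{k+1} = (T_k - t_k * d^(m-1)) * d + t_{k+m}, all mod q  
         t = ((t - text.charAt(k) * h % PRIME + PRIME) * RADIX + text.charAt(k + m)) % PRIME;  
     }  
     return matches;  
   }  

   public static void main(String[] args) {  
     System.out.println(search("abracadabra", "abra")); // prints [0, 7]  
   }  
 }  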

Thursday, August 16, 2012

Augmented Data Structures: LRU Cache

New post after a long absence. I have been busy with my job search, but it seems to have come to an end, so I have time to write some more and share some code. Here's a post about augmenting data structures. Enjoy!
Augmenting a data structure allows adding operations to textbook data structures such as linked lists, trees or hash tables. The basic operations of the data structure should maintain their original asymptotic runtime, and the new operations should be efficient. A very good introduction to the topic can be found in the CLR textbook.
This post focuses on a simple data structure that basically mixes a hash table with a linked list. This data structure can be used to implement a simple LRU cache. The idea is to only cache the N most recently used objects and discard the rest, i.e., the least recently used (LRU) ones. The runtime complexity of data retrieval from the cache should be constant, i.e., from an object-design point of view, a cache object of the form Cache<Key, Data> could provide a get(Key) method that returns the Data object in O(1) time.
The first idea that comes to mind is to have a hash table do the heavy lifting of storing (Key, Data) pairs, leading to constant retrieval time. The LRU requirement is trickier. The first thing one can think of is to use a queue to store the N most recently accessed keys: when the size of the queue exceeds N, we could just pop it and remove that key from the table as well. This strategy, however, will not work: what if the same key was accessed N times? We would be removing the only object in the cache! Moreover, if we store the same object in the queue N times, we are basically storing just the last object used, not the last N. Replacing the queue with a linked list and scanning the whole list every time might fix this problem, but it would make the access time linear in the size of the cache, i.e., O(N), which would completely defeat the purpose.
There are probably multiple ways to fix this, but I think the most elegant and efficient solution is to maintain a doubly linked list of (Key, Data) pairs and to store the nodes of the list in a hash table indexed by key. When the get method is called, the node can be retrieved in constant time and moved to the front of the linked list. We can easily make sure that the list never exceeds N elements by cutting its tail (and removing the corresponding entries from the table), which contains the least recently used keys.
Here's a Java implementation of this idea. The DataSource interface simply defines a get(Key) method returning a Data object; its one-line definition is included at the top of the listing.
 import java.util.Hashtable;  
 // the interface described above: the cache both consumes and implements it  
 interface DataSource<Key, Data> { Data get(Key key); }  
 public class LRUCache<Key, Data> implements DataSource<Key, Data> {  
      private final DataSource<Key, Data> source;  
      private final Hashtable<Key, Node> cache;  
      private final int maxRecords;  
      private Node head = null, tail = null;  
      public LRUCache(DataSource<Key, Data> source, int maxRecords) {  
           this.source = source;  
           this.maxRecords = maxRecords;  
           cache = new Hashtable<>();  
      }  
      @Override  
      public Data get(Key key) {  
           if (cache.containsKey(key)) {// the data is in the cache  
                Node node = cache.get(key);  
                if (node != head) {  
                     if (node == tail)  
                          tail = tail.prev;  
                     if (node.prev != null)  
                          node.prev.next = node.next;  
                     if (node.next != null)  
                          node.next.prev = node.prev;  
                     node.prev = null;  
                     node.next = head;  
                     if (head != null)  
                          head.prev = node;  
                     else  
                          tail = node;  
                     head = node;  
                }  
                // return the data  
                return node.data;  
           }  
           // retrieve the data from the source  
           Data data = source.get(key);  
           // put the data at the head of the list and into the table  
           Node node = new Node(key, data);  
           cache.put(key, node);  
           node.next = head;  
           if (head != null)  
                head.prev = node;  
           else  
                tail = node;  
           head = node;  
           while (cache.size() > maxRecords)  
                if (tail != null) {  
                     cache.remove(tail.key);  
                     if (tail.prev != null)  
                          tail.prev.next = null;  
                     tail = tail.prev;  
                }  
           return data;  
      }  
      private class Node {  
           public Node(Key key, Data data) {  
                this.key = key;  
                this.data = data;  
           }  
           final Key key;  
           final Data data;  
           Node prev;  
           Node next;  
      }  
 }  
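
A quick (made-up) usage example, wrapping a slow data source with a capacity of 2:

 public class LRUCacheDemo {  
   public static void main(String[] args) {  
     DataSource<Integer, String> slowSource = new DataSource<Integer, String>() {  
       @Override  
       public String get(Integer key) {  
         System.out.println("fetching " + key + " from the source...");  
         return "record-" + key;  
       }  
     };  
     DataSource<Integer, String> cache = new LRUCache<>(slowSource, 2);  
     cache.get(1); // fetched from the source  
     cache.get(2); // fetched from the source  
     cache.get(1); // served from the cache  
     cache.get(3); // fetched; evicts key 2, the least recently used  
     cache.get(2); // fetched again: it had been evicted  
   }  
 }  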

It's worth noting that this implementation is NOT concurrent. Designing a concurrent LRU cache is a little trickier: if we synchronize the whole get() method, the cache might block for a long time while accessing the data source (say, a database), when it could instead be responding to clients that are querying data that's already cached. A simple idea is to synchronize only the blocks of code that access the cache inside get(), and leave to the external DataSource object the task of synchronizing access to external resources. In the code above, the if statement in get() would have to be synchronized, and so would the block going from the definition of the new node to right before the return. If a concurrent hash map were used, synchronizing the if condition itself wouldn't be necessary; only the if block would have to be synchronized.
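
As a minimal sketch of that locking split (simplified to a plain map, dropping the LRU list so the example stays short, and reusing the DataSource interface from above):

 import java.util.HashMap;  
 import java.util.Map;  

 class SynchronizedCacheSketch<Key, Data> implements DataSource<Key, Data> {  
   private final DataSource<Key, Data> source;  
   private final Map<Key, Data> cache = new HashMap<>();  

   SynchronizedCacheSketch(DataSource<Key, Data> source) {  
     this.source = source;  
   }  

   @Override  
   public Data get(Key key) {  
     synchronized (cache) { // first synchronized block: the cache lookup  
       Data data = cache.get(key);  
       if (data != null) return data;  
     }  
     // the (potentially slow) source access happens outside any lock, so  
     // requests for already-cached keys are not blocked; the trade-off is  
     // that two threads may fetch the same missing key concurrently  
     Data data = source.get(key);  
     synchronized (cache) { // second synchronized block: the cache update  
       cache.put(key, data);  
     }  
     return data;  
   }  
 }  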

Thursday, April 12, 2012

Dynamic Programming and Memoization

I love dynamic programming. It's a great technique for solving many optimization problems. If you're familiar with dynamic programming, you know that it basically consists in solving a problem with optimal substructure, i.e., where the optimal solution is obtained by solving one or more smaller sub-problems of the same kind, and overlapping sub-problems, i.e., where the solution is obtained by repeatedly solving the same sub-problems over and over. Dynamic programming algorithms can usually be written in a recursive fashion, i.e., with a top-down approach. For example, an algorithm to compute Fibonacci numbers might call itself recursively. The algorithm can be "memoized" by storing the value of the i-th Fibonacci number right after it's computed for the first time and then using the stored values when needed. Writing algorithms in a top-down fashion is usually more intuitive and easier to understand, e.g., F_n = F_{n-1} + F_{n-2} for the Fibonacci sequence. Memoization makes sure the algorithm doesn't compute the same Fibonacci number twice, so the running time is O(n). Note that there are smarter ways to compute Fibonacci numbers, e.g., using the recursive formulas F_{2n-1} = F_n^2 + F_{n-1}^2 and F_{2n} = (2F_{n-1} + F_n)F_n, which lead to logarithmic running time.
There are also smarter ways of writing dynamic programming algorithms. Memoization is great because it allows writing code in a natural way, but it leaves us with a lot of overhead due to the call stack. The solution is to write the algorithm in a bottom-up fashion instead. This means solving the simplest sub-problems first and then using them to solve the higher-level sub-problems in progressive order. Going back to our Fibonacci sequence, this means computing the elements of the sequence one after the other, starting from F_0 = 0 and F_1 = 1, then F_2 = F_0 + F_1, and so on. The recursive program can now be written in iterative form, e.g., as a for loop. It's worth noting that the asymptotic running time is still O(n), but the time saved by reducing the call-stack overhead will noticeably improve the constant factors, leading to a faster algorithm. The two styles are sketched below.
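As a quick illustration, here's a minimal sketch of the two styles applied to Fibonacci (simpler than rod cutting, same idea):

 import java.util.Arrays;  

 public class Fibonacci {  
   // top-down: the natural recursive formulation plus a memo table  
   public static long fibMemoized(int n) {  
     long[] memo = new long[n + 1];  
     Arrays.fill(memo, -1);  
     return fibMemoizedAux(n, memo);  
   }  
   private static long fibMemoizedAux(int n, long[] memo) {  
     if (n <= 1) return n;  
     if (memo[n] >= 0) return memo[n]; // reuse a previously computed value  
     memo[n] = fibMemoizedAux(n - 1, memo) + fibMemoizedAux(n - 2, memo);  
     return memo[n];  
   }  
   // bottom-up: same O(n) work, but no recursive call-stack overhead  
   public static long fibBottomUp(int n) {  
     if (n <= 1) return n;  
     long prev = 0, curr = 1;  
     for (int i = 2; i <= n; i++) {  
       long next = prev + curr;  
       prev = curr;  
       curr = next;  
     }  
     return curr;  
   }  
   public static void main(String[] args) {  
     System.out.println(fibMemoized(50));  // 12586269025  
     System.out.println(fibBottomUp(50));  // 12586269025  
   }  
 }  
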
It's interesting to get an idea of the size of this improvement. I decided to implement the cutting-rod problem with both approaches and compare the running times. I'm not going to describe the problem or its solution here, since it's really a textbook example. The figure below shows a plot of the running time for the two implementations of the rod-cutting algorithm: the bottom-up one is clearly the winner, with a constant factor about 3 times smaller than the top-down one (both algorithms run in O(n^2)).


And here's the Java code used to obtain the data in the plot. Enjoy!

 import java.util.Arrays;  
 import java.util.Random;  
 public class RodCutting {  
      public static void main(String[] args) {  
           // the random seed is chosen arbitrarily  
           Random rnd = new Random(2358761235817L);  
           // the class RodCutting contains the two algorithms  
           RodCutting cutRod = new RodCutting();  
           // this creates an array of rod lengths to test. 30 is the interval  
           // between lengths and 1500 the maximum value in the array, thus the  
           // array is {30,60,...,1470,1500}.  
           int rodLengths[] = computeRodLengths(30, 1500);  
           // generate 50 test cases to get consistent results  
           int nCases = 50;  
           // maximum price per rod piece  
           int max = 50;  
           // computing times in ns are stored in these two arrays  
           long[] computingTimeMemoized = new long[rodLengths.length];  
           long[] computingTimeBottomUp = new long[rodLengths.length];  
           for (int i = 0; i < nCases; i++) {  
                // create a test case  
                int[] p = generateTestCase(rodLengths[rodLengths.length - 1], max, rnd);  
                // run the algorithm for each selected rod length  
                for (int j = 0; j < rodLengths.length; j++) {  
                     computingTimeMemoized[j] += testMemoized(p, rodLengths[j], cutRod);  
                     computingTimeBottomUp[j] += testBottomUp(p, rodLengths[j], cutRod);  
                }  
           }  
           System.out.printf("%20s %20s %20s\n", "rod length",  
                     "memoized time [ns]", "bottom-up time [ns]");  
           for (int i = 0; i < rodLengths.length; i++) {  
                computingTimeMemoized[i] /= nCases;  
                computingTimeBottomUp[i] /= nCases;  
                System.out.printf("%20d %20d %20d\n", rodLengths[i],  
                          computingTimeMemoized[i], computingTimeBottomUp[i]);  
           }  
      }  
      public int cutRodMemoized(int[] p, int n) {  
           int r[] = new int[p.length + 1];  
           Arrays.fill(r, Integer.MIN_VALUE);  
           return cutRodMemoizedAux(p, n, r);  
      }  
      private int cutRodMemoizedAux(int[] p, int n, int[] r) {  
           if (r[n] >= 0)  
                return r[n];  
           int q = 0;  
           if (n != 0) {  
                q = Integer.MIN_VALUE;  
                for (int i = 1; i <= n; i++)  
                     q = Math.max(q, p[i - 1] + cutRodMemoizedAux(p, n - i, r));  
           }  
           r[n] = q;  
           return q;  
      }  
      public int cutRodBottomUp(int[] p, int n) {  
           int[] r = new int[p.length + 1];  
           r[0] = 0;  
           for (int j = 1; j <= n; j++) {  
                int q = Integer.MIN_VALUE;  
                for (int i = 1; i <= j; i++)  
                     q = Math.max(q, p[i - 1] + r[j - i]);  
                r[j] = q;  
           }  
           return r[n];  
      }  
      private static int[] computeRodLengths(int interval, int max) {  
           int l = max / interval;  
           int[] ret = new int[l];  
           for (int i = 0; i < l; i++)  
                ret[i] = interval * (i + 1);  
           return ret;  
      }  
      private static long testMemoized(int[] p, int n, RodCutting cutRod) {  
           long start = System.nanoTime();  
           cutRod.cutRodMemoized(p, n);  
           return System.nanoTime() - start;  
      }  
      private static long testBottomUp(int[] p, int n, RodCutting cutRod) {  
           long start = System.nanoTime();  
           cutRod.cutRodBottomUp(p, n);  
           return System.nanoTime() - start;  
      }  
      private static int[] generateTestCase(int n, int max, Random rnd) {  
           int[] ret = new int[n];  
           for (int i = 0; i < n; i++)  
                ret[i] = rnd.nextInt(max);  
           return ret;  
      }  
 }