0

i'm new to the Hadoop world and i'm having some trouble with my final data.

My purpose is to extract data from a facebook page (i'm using restfb API) using flume, then the data goes to HDFS which will be used by HIVE to gerenerate the final data. This happens every hour. All this on HUE.

I don't know why, but sometimes I success in extract data from the hole day. And some days, I can only extract data from a few hours.

This is the data from Flume:

enter image description here

As you can see, on 03/21 I could only extract the first 4h from the day. While on 03/22, I could extract the hole day.

Some more info. enter image description here

My Flume config. from Cloudera Manager

FacebookAgent.sources = FacebookPageFansCity FacebookPageFansGenderAge FacebookPageFans FacebookPagePosts FacebookPageViews
FacebookAgent.channels = MemoryChannelFacebookPageFansCity MemoryChannelFacebookPageFansGenderAge MemoryChannelFacebookPageFans MemoryChannelFacebookPagePosts MemoryChannelFacebookPageViews
FacebookAgent.sinks = HDFSFacebookPageFansCity HDFSFacebookPageFansGenderAge HDFSFacebookPageFans HDFSFacebookPagePosts HDFSFacebookPageViews

# FacebookPageFansCity

FacebookAgent.sources.FacebookPageFansCity.type = br.com.tsystems.hadoop.flume.source.restfb.FacebookPageFansCitySource
FacebookAgent.sources.FacebookPageFansCity.channels = MemoryChannelFacebookPageFansCity
FacebookAgent.sources.FacebookPageFansCity.appId = null
FacebookAgent.sources.FacebookPageFansCity.appSecret = null
FacebookAgent.sources.FacebookPageFansCity.accessToken = *confidential*
FacebookAgent.sources.FacebookPageFansCity.pageId = *confidential*
FacebookAgent.sources.FacebookPageFansCity.proxyEnabled = false
FacebookAgent.sources.FacebookPageFansCity.proxyHost = null
FacebookAgent.sources.FacebookPageFansCity.proxyPort = -1
FacebookAgent.sources.FacebookPageFansCity.refreshInterval = 3600

FacebookAgent.sinks.HDFSFacebookPageFansCity.channel = MemoryChannelFacebookPageFansCity
FacebookAgent.sinks.HDFSFacebookPageFansCity.type = hdfs
FacebookAgent.sinks.HDFSFacebookPageFansCity.hdfs.path = hdfs://hdoop01:8020/user/flume/pocfacebook/pagefanscity/%Y%m%d%H
FacebookAgent.sinks.HDFSFacebookPageFansCity.hdfs.fileType = DataStream
FacebookAgent.sinks.HDFSFacebookPageFansCity.hdfs.writeFormat = Text
FacebookAgent.sinks.HDFSFacebookPageFansCity.hdfs.batchSize = 1000
FacebookAgent.sinks.HDFSFacebookPageFansCity.hdfs.rollSize = 0
FacebookAgent.sinks.HDFSFacebookPageFansCity.hdfs.rollCount = 10000

FacebookAgent.channels.MemoryChannelFacebookPageFansCity.type = memory
FacebookAgent.channels.MemoryChannelFacebookPageFansCity.capacity = 10000
FacebookAgent.channels.MemoryChannelFacebookPageFansCity.transactionCapacity = 1000

# FacebookPageFansGenderAge

FacebookAgent.sources.FacebookPageFansGenderAge.type = br.com.tsystems.hadoop.flume.source.restfb.FacebookPageFansGenderAgeSource
FacebookAgent.sources.FacebookPageFansGenderAge.channels = MemoryChannelFacebookPageFansGenderAge
FacebookAgent.sources.FacebookPageFansGenderAge.appId = null
FacebookAgent.sources.FacebookPageFansGenderAge.appSecret = null
FacebookAgent.sources.FacebookPageFansGenderAge.accessToken = *confidential*
FacebookAgent.sources.FacebookPageFansGenderAge.pageId = *confidential*
FacebookAgent.sources.FacebookPageFansGenderAge.proxyEnabled = false
FacebookAgent.sources.FacebookPageFansGenderAge.proxyHost = null
FacebookAgent.sources.FacebookPageFansGenderAge.proxyPort = -1
FacebookAgent.sources.FacebookPageFansGenderAge.refreshInterval = 3600

FacebookAgent.sinks.HDFSFacebookPageFansGenderAge.channel = MemoryChannelFacebookPageFansGenderAge
FacebookAgent.sinks.HDFSFacebookPageFansGenderAge.type = hdfs
FacebookAgent.sinks.HDFSFacebookPageFansGenderAge.hdfs.path = hdfs://hdoop01:8020/user/flume/pocfacebook/pagefansgenderage/%Y%m%d%H
FacebookAgent.sinks.HDFSFacebookPageFansGenderAge.hdfs.fileType = DataStream
FacebookAgent.sinks.HDFSFacebookPageFansGenderAge.hdfs.writeFormat = Text
FacebookAgent.sinks.HDFSFacebookPageFansGenderAge.hdfs.batchSize = 1000
FacebookAgent.sinks.HDFSFacebookPageFansGenderAge.hdfs.rollSize = 0
FacebookAgent.sinks.HDFSFacebookPageFansGenderAge.hdfs.rollCount = 10000

FacebookAgent.channels.MemoryChannelFacebookPageFansGenderAge.type = memory
FacebookAgent.channels.MemoryChannelFacebookPageFansGenderAge.capacity = 10000
FacebookAgent.channels.MemoryChannelFacebookPageFansGenderAge.transactionCapacity = 1000

# FacebookPageFans

FacebookAgent.sources.FacebookPageFans.type = br.com.tsystems.hadoop.flume.source.restfb.FacebookPageFansSource
FacebookAgent.sources.FacebookPageFans.channels = MemoryChannelFacebookPageFans
FacebookAgent.sources.FacebookPageFans.appId = null
FacebookAgent.sources.FacebookPageFans.appSecret = null
FacebookAgent.sources.FacebookPageFans.accessToken = *confidential*
FacebookAgent.sources.FacebookPageFans.pageId = *confidential*
FacebookAgent.sources.FacebookPageFans.proxyEnabled = false
FacebookAgent.sources.FacebookPageFans.proxyHost = null
FacebookAgent.sources.FacebookPageFans.proxyPort = -1
FacebookAgent.sources.FacebookPageFans.refreshInterval = 3600

FacebookAgent.sinks.HDFSFacebookPageFans.channel = MemoryChannelFacebookPageFans
FacebookAgent.sinks.HDFSFacebookPageFans.type = hdfs
FacebookAgent.sinks.HDFSFacebookPageFans.hdfs.path = hdfs://hdoop01:8020/user/flume/pocfacebook/pagefans/%Y%m%d%H
FacebookAgent.sinks.HDFSFacebookPageFans.hdfs.fileType = DataStream
FacebookAgent.sinks.HDFSFacebookPageFans.hdfs.writeFormat = Text
FacebookAgent.sinks.HDFSFacebookPageFans.hdfs.batchSize = 1000
FacebookAgent.sinks.HDFSFacebookPageFans.hdfs.rollSize = 0
FacebookAgent.sinks.HDFSFacebookPageFans.hdfs.rollCount = 10000

FacebookAgent.channels.MemoryChannelFacebookPageFans.type = memory
FacebookAgent.channels.MemoryChannelFacebookPageFans.capacity = 10000
FacebookAgent.channels.MemoryChannelFacebookPageFans.transactionCapacity = 1000

# FacebookPagePosts

FacebookAgent.sources.FacebookPagePosts.type = br.com.tsystems.hadoop.flume.source.restfb.FacebookPagePostsSource
FacebookAgent.sources.FacebookPagePosts.channels = MemoryChannelFacebookPagePosts
FacebookAgent.sources.FacebookPagePosts.appId = null
FacebookAgent.sources.FacebookPagePosts.appSecret = null
FacebookAgent.sources.FacebookPagePosts.accessToken = *confidential*
FacebookAgent.sources.FacebookPagePosts.pageId = *confidential*
FacebookAgent.sources.FacebookPagePosts.proxyEnabled = false
FacebookAgent.sources.FacebookPagePosts.proxyHost = null
FacebookAgent.sources.FacebookPagePosts.proxyPort = -1
FacebookAgent.sources.FacebookPagePosts.refreshInterval = 3600

FacebookAgent.sinks.HDFSFacebookPagePosts.channel = MemoryChannelFacebookPagePosts
FacebookAgent.sinks.HDFSFacebookPagePosts.type = hdfs
FacebookAgent.sinks.HDFSFacebookPagePosts.hdfs.path = hdfs://hdoop01:8020/user/flume/pocfacebook/pageposts/%Y%m%d%H
FacebookAgent.sinks.HDFSFacebookPagePosts.hdfs.fileType = DataStream
FacebookAgent.sinks.HDFSFacebookPagePosts.hdfs.writeFormat = Text
FacebookAgent.sinks.HDFSFacebookPagePosts.hdfs.batchSize = 1000
FacebookAgent.sinks.HDFSFacebookPagePosts.hdfs.rollSize = 0
FacebookAgent.sinks.HDFSFacebookPagePosts.hdfs.rollCount = 10000

FacebookAgent.channels.MemoryChannelFacebookPagePosts.type = memory
FacebookAgent.channels.MemoryChannelFacebookPagePosts.capacity = 10000
FacebookAgent.channels.MemoryChannelFacebookPagePosts.transactionCapacity = 5000

# FacebookPageViews

FacebookAgent.sources.FacebookPageViews.type = br.com.tsystems.hadoop.flume.source.restfb.FacebookPageViewsSource
FacebookAgent.sources.FacebookPageViews.channels = MemoryChannelFacebookPageViews
FacebookAgent.sources.FacebookPageViews.appId = null
FacebookAgent.sources.FacebookPageViews.appSecret = null
FacebookAgent.sources.FacebookPageViews.accessToken = *confidential*
FacebookAgent.sources.FacebookPageViews.pageId = *confidential*
FacebookAgent.sources.FacebookPageViews.proxyEnabled = false
FacebookAgent.sources.FacebookPageViews.proxyHost = null
FacebookAgent.sources.FacebookPageViews.proxyPort = -1
FacebookAgent.sources.FacebookPageViews.refreshInterval = 3600

FacebookAgent.sinks.HDFSFacebookPageViews.channel = MemoryChannelFacebookPageViews
FacebookAgent.sinks.HDFSFacebookPageViews.type = hdfs
FacebookAgent.sinks.HDFSFacebookPageViews.hdfs.path = hdfs://hdoop01:8020/user/flume/pocfacebook/pageviews/%Y%m%d%H
FacebookAgent.sinks.HDFSFacebookPageViews.hdfs.fileType = DataStream
FacebookAgent.sinks.HDFSFacebookPageViews.hdfs.writeFormat = Text
FacebookAgent.sinks.HDFSFacebookPageViews.hdfs.batchSize = 1000
FacebookAgent.sinks.HDFSFacebookPageViews.hdfs.rollSize = 0
FacebookAgent.sinks.HDFSFacebookPageViews.hdfs.rollCount = 10000

FacebookAgent.channels.MemoryChannelFacebookPageViews.type = memory
FacebookAgent.channels.MemoryChannelFacebookPageViews.capacity = 10000
FacebookAgent.channels.MemoryChannelFacebookPageViews.transactionCapacity = 1000

Can anybody help me?

UPDATE

My coordinator from Oozie

enter image description here

Gabriel Braga
  • 41
  • 1
  • 10
  • You should probably use a bigger aggregation time window, like every 30 minutes or 1 hour if you are using Oozie (~ not really real time). – Romain Apr 06 '15 at 21:25
  • @Romain, I update my post with a image from my coordinator. As you can see, it is alrealdy set work every hour. – Gabriel Braga Apr 10 '15 at 14:44

0 Answers0